diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index ee575e882c9..9b76ba740dd 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -105,7 +105,7 @@ void emit_instruction(asm_context& ctx, std::vector& out, Instruction* encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ? instr->definitions[0].physReg() << 16 : - !instr->operands.empty() && !(instr->operands[0].physReg() == scc) ? + !instr->operands.empty() && instr->operands[0].physReg() <= 127 ? instr->operands[0].physReg() << 16 : 0; encoding |= sopk->imm; out.push_back(encoding); diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index b094340b02f..63fd36f3724 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -647,6 +647,61 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond)); } +void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val, + aco_opcode op, uint32_t undo) +{ + /* multiply by 16777216 to handle denormals */ + Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(s2)), + as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4)))); + Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val); + scaled = bld.vop1(op, bld.def(v1), scaled); + scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled); + + Temp not_scaled = bld.vop1(op, bld.def(v1), val); + + bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal); +} + +void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->block->fp_mode.denorm32 == 0) { + bld.vop1(aco_opcode::v_rcp_f32, dst, val); + return; + } + + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 
0x4b800000u); +} + +void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->block->fp_mode.denorm32 == 0) { + bld.vop1(aco_opcode::v_rsq_f32, dst, val); + return; + } + + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u); +} + +void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->block->fp_mode.denorm32 == 0) { + bld.vop1(aco_opcode::v_sqrt_f32, dst, val); + return; + } + + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u); +} + +void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val) +{ + if (ctx->block->fp_mode.denorm32 == 0) { + bld.vop1(aco_opcode::v_log_f32, dst, val); + return; + } + + emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u); +} + void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) { if (!instr->dest.dest.is_ssa) { @@ -1399,7 +1454,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_frsq: { if (dst.size() == 1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f32, dst); + emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); } else if (dst.size() == 2) { emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); } else { @@ -1412,8 +1467,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fneg: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.size() == 1) { + if (ctx->block->fp_mode.must_flush_denorms32) + src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src)); } else if (dst.size() == 2) { + if (ctx->block->fp_mode.must_flush_denorms16_64) + src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src)); Temp upper = bld.tmp(v1), lower = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); upper = 
bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper); @@ -1428,8 +1487,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fabs: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.size() == 1) { + if (ctx->block->fp_mode.must_flush_denorms32) + src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src)); } else if (dst.size() == 2) { + if (ctx->block->fp_mode.must_flush_denorms16_64) + src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src)); Temp upper = bld.tmp(v1), lower = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper); @@ -1458,7 +1521,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_flog2: { if (dst.size() == 1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f32, dst); + emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -1468,7 +1531,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_frcp: { if (dst.size() == 1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f32, dst); + emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); } else if (dst.size() == 2) { emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); } else { @@ -1490,7 +1553,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fsqrt: { if (dst.size() == 1) { - emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f32, dst); + emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0])); } else if (dst.size() == 2) { emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); } else { @@ 
-2040,8 +2103,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = bld.tmp(v1); Temp src1 = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src); - bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1); - + if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz) + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1); + else + bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst), + bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0), + bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1)); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -2074,7 +2141,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) break; } case nir_op_fquantize2f16: { - Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), get_alu_src(ctx, instr->src[0])); + Temp src = get_alu_src(ctx, instr->src[0]); + Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src); Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */ @@ -2083,7 +2151,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); - bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res); + if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) { + Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res); + } else { + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res); + } break; } case nir_op_bfm: { @@ -7593,6 +7666,56 @@ void handle_bc_optimize(isel_context *ctx) } } +void setup_fp_mode(isel_context *ctx, nir_shader *shader) +{ + Program *program = ctx->program; + + unsigned float_controls = 
shader->info.float_controls_execution_mode; + + program->next_fp_mode.preserve_signed_zero_inf_nan32 = + float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32; + program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = + float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 | + FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64); + + program->next_fp_mode.must_flush_denorms32 = + float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32; + program->next_fp_mode.must_flush_denorms16_64 = + float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | + FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64); + + program->next_fp_mode.care_about_round32 = + float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32); + + program->next_fp_mode.care_about_round16_64 = + float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | + FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); + + /* default to preserving fp16 and fp64 denorms, since it's free */ + if (program->next_fp_mode.must_flush_denorms16_64) + program->next_fp_mode.denorm16_64 = 0; + else + program->next_fp_mode.denorm16_64 = fp_denorm_keep; + + /* preserving fp32 denorms is expensive, so only do it if asked */ + if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) + program->next_fp_mode.denorm32 = fp_denorm_keep; + else + program->next_fp_mode.denorm32 = 0; + + if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) + program->next_fp_mode.round32 = fp_round_tz; + else + program->next_fp_mode.round32 = fp_round_ne; + + if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64)) + program->next_fp_mode.round16_64 = fp_round_tz; + else + program->next_fp_mode.round16_64 = fp_round_ne; + + ctx->block->fp_mode = program->next_fp_mode; +} + void select_program(Program *program, unsigned shader_count, struct nir_shader *const *shaders, @@ -7606,6 
+7729,8 @@ void select_program(Program *program, nir_shader *nir = shaders[i]; init_context(&ctx, nir); + setup_fp_mode(&ctx, nir); + if (!i) { add_startpgm(&ctx); /* needs to be after init_context() for FS */ append_logical_start(ctx.block); @@ -7648,6 +7773,8 @@ void select_program(Program *program, ralloc_free(ctx.divergent_vals); } + program->config->float_mode = program->blocks[0].fp_mode.val; + append_logical_end(ctx.block); ctx.block->kind |= block_kind_uniform; Builder bld(ctx.program, ctx.block); diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index cdc8103497b..807ce746868 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -1360,7 +1360,6 @@ setup_isel_context(Program* program, scratch_size = std::max(scratch_size, shaders[i]->scratch_size); ctx.scratch_enabled = scratch_size > 0; ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024); - ctx.program->config->float_mode = V_00B028_FP_64_DENORMS; ctx.block = ctx.program->create_and_insert_block(); ctx.block->loop_nest_depth = 0; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index a6fe846c74d..59e77feffe5 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -110,6 +110,53 @@ enum barrier_interaction { barrier_count = 4, }; +enum fp_round { + fp_round_ne = 0, + fp_round_pi = 1, + fp_round_ni = 2, + fp_round_tz = 3, +}; + +enum fp_denorm { + /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and + * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. 
*/ + fp_denorm_flush = 0x0, + fp_denorm_keep = 0x3, +}; + +struct float_mode { + /* matches encoding of the MODE register */ + union { + struct { + fp_round round32:2; + fp_round round16_64:2; + unsigned denorm32:2; + unsigned denorm16_64:2; + }; + uint8_t val = 0; + }; + /* if false, optimizations which may remove infs/nan/-0.0 can be done */ + bool preserve_signed_zero_inf_nan32:1; + bool preserve_signed_zero_inf_nan16_64:1; + /* if false, optimizations which may remove denormal flushing can be done */ + bool must_flush_denorms32:1; + bool must_flush_denorms16_64:1; + bool care_about_round32:1; + bool care_about_round16_64:1; + + /* Returns true if instructions using the mode "other" can safely use the + * current one instead. */ + bool canReplace(float_mode other) const noexcept { + return val == other.val && + (preserve_signed_zero_inf_nan32 || !other.preserve_signed_zero_inf_nan32) && + (preserve_signed_zero_inf_nan16_64 || !other.preserve_signed_zero_inf_nan16_64) && + (must_flush_denorms32 || !other.must_flush_denorms32) && + (must_flush_denorms16_64 || !other.must_flush_denorms16_64) && + (care_about_round32 || !other.care_about_round32) && + (care_about_round16_64 || !other.care_about_round16_64); + } +}; + constexpr Format asVOP3(Format format) { return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format); }; @@ -1019,6 +1066,7 @@ struct RegisterDemand { /* CFG */ struct Block { + float_mode fp_mode; unsigned index; unsigned offset = 0; std::vector> instructions; @@ -1086,6 +1134,7 @@ static constexpr Stage geometry_gs = sw_gs | hw_gs; class Program final { public: + float_mode next_fp_mode; std::vector blocks; RegisterDemand max_reg_demand = RegisterDemand(); uint16_t num_waves = 0; @@ -1133,11 +1182,13 @@ public: Block* create_and_insert_block() { blocks.emplace_back(blocks.size()); + blocks.back().fp_mode = next_fp_mode; return &blocks.back(); } Block* insert_block(Block&& block) { block.index = blocks.size(); + block.fp_mode = next_fp_mode; 
blocks.emplace_back(std::move(block)); return &blocks.back(); } diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 1502619b9db..8db54064202 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -592,6 +592,22 @@ void lower_to_hw_instr(Program* program) ctx.program = program; Builder bld(program, &ctx.instructions); + bool set_mode = i == 0 && block->fp_mode.val != program->config->float_mode; + for (unsigned pred : block->linear_preds) { + if (program->blocks[pred].fp_mode.val != block->fp_mode.val) { + set_mode = true; + break; + } + } + if (set_mode) { + /* only allow changing modes at top-level blocks so this doesn't break + * the "jump over empty blocks" optimization */ + assert(block->kind & block_kind_top_level); + uint32_t mode = block->fp_mode.val; + /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */ + bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(mode), (7 << 11) | 1); + } + for (size_t j = 0; j < block->instructions.size(); j++) { aco_ptr& instr = block->instructions[j]; aco_ptr mov; diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 803249637d5..40823da3c36 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -303,7 +303,8 @@ void process_block(vn_ctx& ctx, Block& block) Instruction* orig_instr = res.first->first; assert(instr->definitions.size() == orig_instr->definitions.size()); /* check if the original instruction dominates the current one */ - if (dominates(ctx, res.first->second, block.index)) { + if (dominates(ctx, res.first->second, block.index) && + ctx.program->blocks[res.first->second].fp_mode.canReplace(block.fp_mode)) { for (unsigned i = 0; i < instr->definitions.size(); i++) { assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass()); 
ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp(); diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 5b4fcf75126..7b66aa1eeb3 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -548,7 +548,7 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp return false; } -void label_instruction(opt_ctx &ctx, aco_ptr& instr) +void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) { if (instr->isSALU() || instr->isVALU() || instr->format == Format::PSEUDO) { ASSERTED bool all_const = false; @@ -888,7 +888,8 @@ void label_instruction(opt_ctx &ctx, aco_ptr& instr) ctx.info[instr->operands[i].tempId()].set_omod4(); } else if (instr->operands[!i].constantValue() == 0x3f000000) { /* 0.5 */ ctx.info[instr->operands[i].tempId()].set_omod5(); - } else if (instr->operands[!i].constantValue() == 0x3f800000) { /* 1.0 */ + } else if (instr->operands[!i].constantValue() == 0x3f800000 && + !block.fp_mode.must_flush_denorms32) { /* 1.0 */ ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[i].getTemp()); } else { continue; @@ -1892,7 +1893,7 @@ void apply_sgprs(opt_ctx &ctx, aco_ptr& instr) } } -bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) +bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr& instr) { /* check if we could apply omod on predecessor */ if (instr->opcode == aco_opcode::v_mul_f32) { @@ -1959,18 +1960,21 @@ bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) } } + /* omod has no effect if denormals are enabled */ + bool can_use_omod = block.fp_mode.denorm32 == 0; + /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */ if (!instr->definitions.empty() && ctx.uses[instr->definitions[0].tempId()] == 1 && can_use_VOP3(instr) && instr_info.can_use_output_modifiers[(int)instr->opcode]) { - if(ctx.info[instr->definitions[0].tempId()].is_omod2()) { + if 
(can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod2()) { to_VOP3(ctx, instr); static_cast(instr.get())->omod = 1; ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); - } else if (ctx.info[instr->definitions[0].tempId()].is_omod4()) { + } else if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod4()) { to_VOP3(ctx, instr); static_cast(instr.get())->omod = 2; ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); - } else if (ctx.info[instr->definitions[0].tempId()].is_omod5()) { + } else if (can_use_omod && ctx.info[instr->definitions[0].tempId()].is_omod5()) { to_VOP3(ctx, instr); static_cast(instr.get())->omod = 3; ctx.info[instr->definitions[0].tempId()].set_omod_success(instr.get()); @@ -1987,7 +1991,7 @@ bool apply_omod_clamp(opt_ctx &ctx, aco_ptr& instr) // TODO: we could possibly move the whole label_instruction pass to combine_instruction: // this would mean that we'd have to fix the instruction uses while value propagation -void combine_instruction(opt_ctx &ctx, aco_ptr& instr) +void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) { if (instr->definitions.empty() || !ctx.uses[instr->definitions[0].tempId()]) return; @@ -1995,7 +1999,7 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) if (instr->isVALU()) { if (can_apply_sgprs(instr)) apply_sgprs(ctx, instr); - if (apply_omod_clamp(ctx, instr)) + if (apply_omod_clamp(ctx, block, instr)) return; } @@ -2048,9 +2052,11 @@ void combine_instruction(opt_ctx &ctx, aco_ptr& instr) return; } /* combine mul+add -> mad */ - else if (instr->opcode == aco_opcode::v_add_f32 || - instr->opcode == aco_opcode::v_sub_f32 || - instr->opcode == aco_opcode::v_subrev_f32) { + else if ((instr->opcode == aco_opcode::v_add_f32 || + instr->opcode == aco_opcode::v_sub_f32 || + instr->opcode == aco_opcode::v_subrev_f32) && + block.fp_mode.denorm32 == 0 && !block.fp_mode.preserve_signed_zero_inf_nan32) { + //TODO: we could use fma instead when 
denormals are enabled if the NIR isn't marked as precise uint32_t uses_src0 = UINT32_MAX; uint32_t uses_src1 = UINT32_MAX; @@ -2394,7 +2400,7 @@ void optimize(Program* program) /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */ for (Block& block : program->blocks) { for (aco_ptr& instr : block.instructions) - label_instruction(ctx, instr); + label_instruction(ctx, block, instr); } ctx.uses = std::move(dead_code_analysis(program)); @@ -2402,7 +2408,7 @@ void optimize(Program* program) /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */ for (Block& block : program->blocks) { for (aco_ptr& instr : block.instructions) - combine_instruction(ctx, instr); + combine_instruction(ctx, block, instr); } /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */ diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 4775609629f..b561980c123 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1557,6 +1557,8 @@ void radv_GetPhysicalDeviceProperties2( * support for changing the register. The same logic * applies for the rounding modes because they are * configured with the same config register. 
+ * TODO: we can enable a lot of these for ACO when it + * supports all stages */ properties->shaderDenormFlushToZeroFloat32 = true; properties->shaderDenormPreserveFloat32 = false; diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index 587e9820844..a4983ba0f61 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -89,7 +89,7 @@ EXTENSIONS = [ Extension('VK_KHR_shader_atomic_int64', 1, 'LLVM_VERSION_MAJOR >= 9'), Extension('VK_KHR_shader_clock', 1, True), Extension('VK_KHR_shader_draw_parameters', 1, True), - Extension('VK_KHR_shader_float_controls', 1, '!device->use_aco'), + Extension('VK_KHR_shader_float_controls', 1, True), Extension('VK_KHR_shader_float16_int8', 1, '!device->use_aco'), Extension('VK_KHR_spirv_1_4', 1, True), Extension('VK_KHR_storage_buffer_storage_class', 1, True),