From 90faadae72280f0da4abef016889bfbdb6a4d3d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Tue, 18 Feb 2025 11:24:00 +0100 Subject: [PATCH] aco/insert_exec_mask: don't disable dead quads on demote in divergent CF Also force-enable helpers in case of demote in divergent CF. Totals from 1305 (1.64% of 79377) affected shaders: (Navi31) Instrs: 926923 -> 922516 (-0.48%); split: -0.48%, +0.00% CodeSize: 5045292 -> 5027408 (-0.35%); split: -0.36%, +0.00% Latency: 6176577 -> 6174708 (-0.03%); split: -0.03%, +0.00% InvThroughput: 931603 -> 931583 (-0.00%); split: -0.00%, +0.00% SClause: 22816 -> 22855 (+0.17%); split: -0.17%, +0.34% Copies: 57347 -> 55170 (-3.80%); split: -3.81%, +0.01% Branches: 18990 -> 18974 (-0.08%) PreSGPRs: 42734 -> 43248 (+1.20%) SALU: 90511 -> 86153 (-4.81%); split: -4.85%, +0.04% Part-of: --- src/amd/compiler/aco_insert_exec_mask.cpp | 54 +++++++++---------- .../compiler/aco_instruction_selection.cpp | 6 +-- 2 files changed, 27 insertions(+), 33 deletions(-) diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index 831f6a0e04a..482d3401b54 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -510,55 +510,49 @@ process_instructions(exec_ctx& ctx, Block* block, std::vectordefinitions[1] = bld.def(s1, scc); } } else if (instr->opcode == aco_opcode::p_demote_to_helper) { + assert(!ctx.handle_wqm || state == WQM); assert((info.exec[0].type & mask_type_exact) && (info.exec[0].type & mask_type_global)); - const bool nested_cf = !(info.exec.back().type & mask_type_global); - if (ctx.handle_wqm && state == Exact && nested_cf) { - /* Transition back to WQM without extra instruction. */ - info.exec.pop_back(); - state = WQM; - } else if (block->instructions[idx + 1]->opcode == aco_opcode::p_end_wqm) { + if (block->instructions[idx + 1]->opcode == aco_opcode::p_end_wqm) { /* Transition to Exact without extra instruction. 
*/ info.exec.resize(1); state = Exact; - } else if (nested_cf) { - /* Save curent exec temporarily. */ - info.exec.back().op = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm)); } else { + /* Make sure to not use some previously stored temporary. */ info.exec.back().op = Operand(exec, bld.lm); } /* Remove invocations from global exact mask. */ - Definition def = state == Exact ? Definition(exec, bld.lm) : bld.def(bld.lm); Operand src = instr->operands[0].isConstant() ? Operand(exec, bld.lm) : instr->operands[0]; + Operand exit_cond = Operand(exec, bld.lm); - bld.sop2(Builder::s_andn2, def, bld.def(s1, scc), info.exec[0].op, src); - info.exec[0].op = def.isTemp() ? Operand(def.getTemp()) : Operand(exec, bld.lm); + if (state == Exact) { + assert(info.exec.size() == 1); + bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), info.exec[0].op, + src); + } else { + Temp cond = bld.tmp(s1); + info.exec[0].op = bld.sop2(Builder::s_andn2, bld.def(bld.lm), Definition(cond, scc), + info.exec[0].op, src); - /* Update global WQM mask and store in exec. */ - if (state == WQM) { - assert(info.exec.size() > 1); - bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), def.getTemp()); + /* Update global WQM mask and store in exec. */ + if (info.exec.back().type & mask_type_global) { + assert(info.exec.size() == 2); + bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), + info.exec[0].op); + } else { + /* Conditionally set exec=0. Note, that exec might already be zero, so don't use s_branch_execz. */ + bld.sop2(Builder::s_cselect, Definition(exec, bld.lm), Operand(exec, bld.lm), + Operand::zero(bld.lm.bytes()), bld.scc(cond)); + exit_cond = Operand(cond, scc); + } } /* End shader if global mask is zero. */ instr->opcode = aco_opcode::p_exit_early_if_not; - instr->operands[0] = Operand(exec, bld.lm); + instr->operands[0] = exit_cond; bld.insert(std::move(instr)); - /* Update all other exec masks. 
*/ - if (nested_cf) { - const unsigned global_idx = state == WQM ? 1 : 0; - for (unsigned i = global_idx + 1; i < info.exec.size() - 1; i++) { - info.exec[i].op = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), - info.exec[i].op, Operand(exec, bld.lm)); - } - /* Update current exec and save WQM mask. */ - info.exec[global_idx].op = - bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), - Definition(exec, bld.lm), info.exec.back().op, Operand(exec, bld.lm)); - info.exec.back().op = Operand(exec, bld.lm); - } continue; } else if (instr->opcode == aco_opcode::p_elect) { diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index dfc9ba63409..c9f17d0dbf7 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8631,11 +8631,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) bld.pseudo(aco_opcode::p_demote_to_helper, cond); - /* Perform the demote in WQM so that it doesn't make exec empty. WQM should last until at - * least the next top-level block. + /* Perform the demote in WQM so that it doesn't make exec empty. + * WQM should last until at least the next top-level block. */ if (ctx->cf_info.in_divergent_cf) - set_wqm(ctx); + set_wqm(ctx, true); ctx->block->kind |= block_kind_uses_discard; ctx->program->needs_exact = true;