From 7d552d71e94b3080eb569b3fa6763ab905d98b7a Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 24 Sep 2025 16:16:43 +0100 Subject: [PATCH] ac/nir: optimize txd(coord, ddx/ddy(coord)) This is done in ac_nir_lower_tex so that we can optimize derivative calculations with a different exec mask than the texture sample by using the nir_strict_wqm_coord_amd path. It's also more aware of divergence than nir_lower_tex is. fossil-db (gfx1201): Totals from 103 (0.13% of 79839) affected shaders: MaxWaves: 2610 -> 2620 (+0.38%) Instrs: 347283 -> 345912 (-0.39%); split: -0.40%, +0.00% CodeSize: 1892380 -> 1883824 (-0.45%); split: -0.46%, +0.00% VGPRs: 8028 -> 7824 (-2.54%) Latency: 3942575 -> 3939623 (-0.07%); split: -0.08%, +0.01% InvThroughput: 867147 -> 865281 (-0.22%); split: -0.24%, +0.02% VClause: 6230 -> 6221 (-0.14%); split: -0.19%, +0.05% SClause: 3910 -> 3914 (+0.10%); split: -0.26%, +0.36% Copies: 16091 -> 15721 (-2.30%); split: -2.74%, +0.44% PreSGPRs: 4651 -> 4658 (+0.15%) PreVGPRs: 6389 -> 6320 (-1.08%); split: -1.17%, +0.09% VALU: 228715 -> 227490 (-0.54%); split: -0.54%, +0.01% SALU: 32763 -> 32767 (+0.01%); split: -0.06%, +0.07% VMEM: 9027 -> 9024 (-0.03%) Signed-off-by: Rhys Perry Reviewed-by: Georg Lehmann Part-of: --- src/amd/common/nir/ac_nir_lower_tex.c | 107 +++++++++++++++++++++----- 1 file changed, 89 insertions(+), 18 deletions(-) diff --git a/src/amd/common/nir/ac_nir_lower_tex.c b/src/amd/common/nir/ac_nir_lower_tex.c index 19d0ced5c7b..2588fc086f9 100644 --- a/src/amd/common/nir/ac_nir_lower_tex.c +++ b/src/amd/common/nir/ac_nir_lower_tex.c @@ -221,12 +221,20 @@ typedef struct { nir_intrinsic_instr *load; } coord_info; -static bool -can_move_coord(nir_scalar scalar, coord_info *info) +static bool can_move_coord(nir_scalar scalar, coord_info *info, nir_block *toplevel_block, bool txd) { if (scalar.def->bit_size != 32) return false; + /* Allow any def that is reachable from the nir_strict_wqm_coord_amd when + * optimizing nir_texop_txd. Otherwise, we only use nir_strict_wqm_coord_amd + * for cases that D3D11 requires. + */ + if (txd && nir_block_dominates(scalar.def->parent_instr->block, toplevel_block)) { + info->load = NULL; + return true; + } + if (nir_scalar_is_const(scalar)) return true; @@ -273,7 +281,8 @@ struct move_tex_coords_state { struct loop_if_state { bool inside_loop; - bool divergent_discard; + unsigned prev_terminate; + unsigned prev_break_continue; }; static nir_def * @@ -284,6 +293,9 @@ build_coordinate(struct move_tex_coords_state *state, nir_scalar scalar, coord_i if (nir_scalar_is_const(scalar)) return nir_imm_intN_t(b, nir_scalar_as_uint(scalar), scalar.def->bit_size); + if (!info.load) + return nir_mov_scalar(b, scalar); + ASSERTED nir_src offset = *nir_get_io_offset_src(info.load); assert(nir_src_is_const(offset) && !nir_src_as_uint(offset)); @@ -304,11 +316,48 @@ build_coordinate(struct move_tex_coords_state *state, nir_scalar scalar, coord_i return res; } +static bool can_optimize_txd(nir_shader *shader, struct loop_if_state *loop_if, nir_tex_instr *tex, + bool *need_strict_wqm_coord) +{ + nir_instr *ddxy_instrs[NIR_MAX_VEC_COMPONENTS * 2]; + unsigned size = nir_tex_parse_txd_coords(shader, tex, ddxy_instrs); + if (!size) + return false; + + bool incomplete_quad = + tex->instr.block->divergent || loop_if->prev_terminate || loop_if->inside_loop; + + *need_strict_wqm_coord = false; + if (incomplete_quad) { + for (unsigned i = 0; i < size; i++) { + nir_instr *instr = ddxy_instrs[i]; + *need_strict_wqm_coord |= + instr->block->cf_node.parent != tex->instr.block->cf_node.parent || + loop_if->prev_terminate > instr->index || loop_if->prev_break_continue > instr->index; + } + } + + return true; +} + +static bool optimize_txd(nir_tex_instr *tex) +{ + if (tex->op == nir_texop_txd) { + tex->op = nir_texop_tex; + nir_tex_instr_remove_src(tex, nir_tex_instr_src_index(tex, nir_tex_src_ddx)); + nir_tex_instr_remove_src(tex, nir_tex_instr_src_index(tex, nir_tex_src_ddy)); + return true; + } + + return false; +} + static bool move_tex_coords(struct move_tex_coords_state *state, nir_function_impl *impl, nir_instr *instr) { nir_tex_instr *tex = nir_instr_as_tex(instr); - if (tex->op != nir_texop_tex && tex->op != nir_texop_txb && tex->op != nir_texop_lod) + if (tex->op != nir_texop_tex && tex->op != nir_texop_txb && tex->op != nir_texop_lod && + tex->op != nir_texop_txd) return false; switch (tex->sampler_dim) { @@ -333,9 +382,11 @@ move_tex_coords(struct move_tex_coords_state *state, nir_function_impl *impl, ni nir_scalar components[NIR_MAX_VEC_COMPONENTS]; coord_info infos[NIR_MAX_VEC_COMPONENTS]; bool can_move_all = true; + nir_block *toplevel_block = nir_cursor_current_block(state->toplevel_b.cursor); for (unsigned i = 0; i < tex->coord_components; i++) { components[i] = nir_scalar_resolved(src->src.ssa, i); - can_move_all &= can_move_coord(components[i], &infos[i]); + can_move_all &= + can_move_coord(components[i], &infos[i], toplevel_block, tex->op == nir_texop_txd); } if (!can_move_all) return false; @@ -377,6 +428,8 @@ move_tex_coords(struct move_tex_coords_state *state, nir_function_impl *impl, ni if (offset_src >= 0) /* Workaround requirement in nir_tex_instr_src_size(). */ tex->src[offset_src].src_type = nir_tex_src_backend2; + optimize_txd(tex); + state->num_wqm_vgprs += linear_vgpr_size; return true; @@ -391,7 +444,7 @@ move_ddxy(struct move_tex_coords_state *state, nir_function_impl *impl, nir_intr bool can_move_all = true; for (unsigned i = 0; i < num_components; i++) { components[i] = nir_scalar_resolved(instr->src[0].ssa, i); - can_move_all &= can_move_coord(components[i], &infos[i]); + can_move_all &= can_move_coord(components[i], &infos[i], NULL, false); } if (!can_move_all || state->num_wqm_vgprs + num_components > state->options->max_wqm_vgprs) return false; @@ -415,6 +468,7 @@ static bool move_coords_from_divergent_cf(struct move_tex_coords_state *state, struct loop_if_state *loop_if, struct exec_list *cf_list) { nir_function_impl *impl = state->toplevel_b.impl; + nir_shader *shader = impl->function->shader; bool progress = false; foreach_list_typed (nir_cf_node, cf_node, node, cf_list) { @@ -425,27 +479,38 @@ static bool move_coords_from_divergent_cf(struct move_tex_coords_state *state, bool top_level = cf_list == &impl->body; nir_foreach_instr (instr, block) { - if (top_level && !loop_if->divergent_discard) + if (top_level && !loop_if->prev_terminate) state->toplevel_b.cursor = nir_before_instr(instr); /* Assume quads might be incomplete when inside loops in case of a * divergent terminate from a previous iteration. */ bool incomplete_quad = - block->divergent || loop_if->divergent_discard || loop_if->inside_loop; + block->divergent || loop_if->prev_terminate || loop_if->inside_loop; - if (instr->type == nir_instr_type_tex && incomplete_quad) { - progress |= move_tex_coords(state, impl, instr); + if (instr->type == nir_instr_type_tex) { + nir_tex_instr *tex = nir_instr_as_tex(instr); + + if (tex->op == nir_texop_txd) { + bool txd_need_strict_wqm_coord = false; + if (!can_optimize_txd(shader, loop_if, tex, &txd_need_strict_wqm_coord)) + continue; + if (!txd_need_strict_wqm_coord) + progress |= optimize_txd(tex); + } + + if (state->options->fix_derivs_in_divergent_cf && incomplete_quad) + progress |= move_tex_coords(state, impl, instr); } else if (instr->type == nir_instr_type_intrinsic) { nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); switch (intrin->intrinsic) { case nir_intrinsic_terminate: if (block->divergent) - loop_if->divergent_discard = true; + loop_if->prev_terminate = instr->index; break; case nir_intrinsic_terminate_if: if (block->divergent || nir_src_is_divergent(&intrin->src[0])) - loop_if->divergent_discard = true; + loop_if->prev_terminate = instr->index; break; case nir_intrinsic_ddx: case nir_intrinsic_ddy: @@ -459,10 +524,12 @@ static bool move_coords_from_divergent_cf(struct move_tex_coords_state *state, default: break; } + } else if (instr->type == nir_instr_type_jump && block->divergent) { + loop_if->prev_break_continue = instr->index; } } - if (top_level && !loop_if->divergent_discard) + if (top_level && !loop_if->prev_terminate) state->toplevel_b.cursor = nir_after_block_before_jump(block); break; } @@ -472,7 +539,9 @@ static bool move_coords_from_divergent_cf(struct move_tex_coords_state *state, struct loop_if_state inner_else = *loop_if; progress |= move_coords_from_divergent_cf(state, &inner_then, &nif->then_list); progress |= move_coords_from_divergent_cf(state, &inner_else, &nif->else_list); - loop_if->divergent_discard |= inner_then.divergent_discard || inner_else.divergent_discard; + loop_if->prev_terminate = MAX2(inner_then.prev_terminate, inner_else.prev_terminate); + loop_if->prev_break_continue = + MAX2(inner_then.prev_break_continue, inner_else.prev_break_continue); break; } case nir_cf_node_loop: { @@ -481,7 +550,7 @@ static bool move_coords_from_divergent_cf(struct move_tex_coords_state *state, struct loop_if_state inner = *loop_if; inner.inside_loop = true; progress |= move_coords_from_divergent_cf(state, &inner, &loop->body); - loop_if->divergent_discard |= inner.divergent_discard; + loop_if->prev_terminate = inner.prev_terminate; break; } case nir_cf_node_function: @@ -496,9 +565,10 @@ bool ac_nir_lower_tex(nir_shader *nir, const ac_nir_lower_tex_options *options) { bool progress = false; - if (options->fix_derivs_in_divergent_cf) { + if (nir->info.stage == MESA_SHADER_FRAGMENT) { nir_function_impl *impl = nir_shader_get_entrypoint(nir); - nir_metadata_require(impl, nir_metadata_divergence); + nir_metadata_require( + impl, nir_metadata_divergence | nir_metadata_dominance | nir_metadata_instr_index); struct move_tex_coords_state state; state.toplevel_b = nir_builder_create(impl); @@ -507,7 +577,8 @@ ac_nir_lower_tex(nir_shader *nir, const ac_nir_lower_tex_options *options) struct loop_if_state loop_if; loop_if.inside_loop = false; - loop_if.divergent_discard = false; + loop_if.prev_terminate = 0; + loop_if.prev_break_continue = 0; bool impl_progress = move_coords_from_divergent_cf(&state, &loop_if, &impl->body); progress |= nir_progress(impl_progress, impl, nir_metadata_control_flow); }