From f7939f2fdc200259c8af3380854ac16f7360b28d Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Mon, 16 Jun 2025 11:38:53 -0700 Subject: [PATCH] nir/range_analysis: Handle bfi and bitfield_select in get_alu_uub I noticed some things related to this while implementing support for bitfield_select / BFN in BRW. shader-db: Lunar Lake total instructions in shared programs: 17183140 -> 17183128 (<.01%) instructions in affected programs: 3830 -> 3818 (-0.31%) helped: 6 / HURT: 0 total cycles in shared programs: 889936934 -> 889936056 (<.01%) cycles in affected programs: 253758 -> 252880 (-0.35%) helped: 4 / HURT: 2 No shader-db changes on any other Intel platform. fossil-db: Lunar Lake Totals: Instrs: 233285343 -> 233284796 (-0.00%); split: -0.00%, +0.00% Cycle count: 32756777978 -> 32756399804 (-0.00%); split: -0.00%, +0.00% Max live registers: 71738646 -> 71738626 (-0.00%) Non SSA regs after NIR: 67837900 -> 67837902 (+0.00%) Totals from 177 (0.02% of 790723) affected shaders: Instrs: 389849 -> 389302 (-0.14%); split: -0.14%, +0.00% Cycle count: 356341872 -> 355963698 (-0.11%); split: -0.11%, +0.01% Max live registers: 39364 -> 39344 (-0.05%) Non SSA regs after NIR: 70453 -> 70455 (+0.00%) Meteor Lake, DG2, and Ice Lake had similar results. (Meteor Lake shown) Totals: Instrs: 264095611 -> 264095358 (-0.00%) Cycle count: 26555705299 -> 26554303407 (-0.01%); split: -0.01%, +0.00% Fill count: 613233 -> 613231 (-0.00%) Totals from 123 (0.01% of 905547) affected shaders: Instrs: 334830 -> 334577 (-0.08%) Cycle count: 326531667 -> 325129775 (-0.43%); split: -0.65%, +0.22% Fill count: 4145 -> 4143 (-0.05%) Tiger Lake and Skylake had similar results. 
(Tiger Lake shown) Totals: Instrs: 269733849 -> 269733590 (-0.00%) Cycle count: 25240548036 -> 25241435039 (+0.00%); split: -0.00%, +0.01% Totals from 123 (0.01% of 903812) affected shaders: Instrs: 338617 -> 338358 (-0.08%) Cycle count: 326605644 -> 327492647 (+0.27%); split: -0.13%, +0.40% Reviewed-by: Matt Turner Reviewed-by: Georg Lehmann Part-of: --- src/compiler/nir/nir_range_analysis.c | 53 +++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/compiler/nir/nir_range_analysis.c b/src/compiler/nir/nir_range_analysis.c index 0d65ecab8cf..c7690a0a942 100644 --- a/src/compiler/nir/nir_range_analysis.c +++ b/src/compiler/nir/nir_range_analysis.c @@ -1837,7 +1837,9 @@ get_alu_uub(struct analysis_state *state, struct scalar_query q, uint32_t *resul case nir_op_bcsel: case nir_op_b32csel: case nir_op_ubfe: + case nir_op_bfi: case nir_op_bfm: + case nir_op_bitfield_select: case nir_op_extract_u8: case nir_op_extract_i8: case nir_op_extract_u16: @@ -1999,6 +2001,57 @@ get_alu_uub(struct analysis_state *state, struct scalar_query q, uint32_t *resul } break; } + + case nir_op_bfi: { + nir_scalar src0_scalar = nir_scalar_chase_alu_src(q.scalar, 0); + const uint64_t s1 = bitmask(util_last_bit64(src[1])); + const uint64_t s2 = bitmask(util_last_bit64(src[2])); + + if (nir_scalar_is_const(src0_scalar)) { + const uint64_t s0 = nir_scalar_as_uint(src0_scalar); + + /* This case should be eliminated by opt_algebraic. */ + if (s0 == 0) { + *result = s2; + } else { + const int x = ffsll(s0) - 1; + *result = (s0 & (s1 << x)) | (~s0 & s2); + } + } else { + const uint64_t s0 = bitmask(util_last_bit64(src[0])); + + /* Due to the unpredictable shift, the true maximum value of (s0 & + * (s1 << x)) cannot be known. However, it cannot be larger than + * s0. + * + * inot doesn't work in get_alu_uub. It is known that (~s0 & s2) + * cannot be larger than s2, so just use s2 as a loose upper bound. 
+ */ + *result = s0 | s2; + } + break; + } + + case nir_op_bitfield_select: { + nir_scalar src0_scalar = nir_scalar_chase_alu_src(q.scalar, 0); + const uint64_t s1 = bitmask(util_last_bit64(src[1])); + const uint64_t s2 = bitmask(util_last_bit64(src[2])); + + if (nir_scalar_is_const(src0_scalar)) { + const uint64_t s0 = nir_scalar_as_uint(src0_scalar); + + *result = (s0 & s1) | (~s0 & s2); + } else { + const uint64_t s0 = bitmask(util_last_bit64(src[0])); + + /* inot doesn't work in get_alu_uub. It is known that (~s0 & s2) + * cannot be larger than s2, so just use s2 as a loose upper bound. + */ + *result = (s0 & s1) | s2; + } + break; + } + + /* limited floating-point support for f2u32(fmul(load_input(), <constant>)) */ case nir_op_f2i32: case nir_op_f2u32: