nir/range_analysis: Handle bfi and bitfield_select in get_alu_uub
I noticed some things related to this while implementing support for
bitfield_select / BFN in BRW.

shader-db:

Lunar Lake
total instructions in shared programs: 17183140 -> 17183128 (<.01%)
instructions in affected programs: 3830 -> 3818 (-0.31%)
helped: 6 / HURT: 0

total cycles in shared programs: 889936934 -> 889936056 (<.01%)
cycles in affected programs: 253758 -> 252880 (-0.35%)
helped: 4 / HURT: 2

No shader-db changes on any other Intel platform.

fossil-db:

Lunar Lake
Totals:
Instrs: 233285343 -> 233284796 (-0.00%); split: -0.00%, +0.00%
Cycle count: 32756777978 -> 32756399804 (-0.00%); split: -0.00%, +0.00%
Max live registers: 71738646 -> 71738626 (-0.00%)
Non SSA regs after NIR: 67837900 -> 67837902 (+0.00%)

Totals from 177 (0.02% of 790723) affected shaders:
Instrs: 389849 -> 389302 (-0.14%); split: -0.14%, +0.00%
Cycle count: 356341872 -> 355963698 (-0.11%); split: -0.11%, +0.01%
Max live registers: 39364 -> 39344 (-0.05%)
Non SSA regs after NIR: 70453 -> 70455 (+0.00%)

Meteor Lake, DG2, and Ice Lake had similar results. (Meteor Lake shown)
Totals:
Instrs: 264095611 -> 264095358 (-0.00%)
Cycle count: 26555705299 -> 26554303407 (-0.01%); split: -0.01%, +0.00%
Fill count: 613233 -> 613231 (-0.00%)

Totals from 123 (0.01% of 905547) affected shaders:
Instrs: 334830 -> 334577 (-0.08%)
Cycle count: 326531667 -> 325129775 (-0.43%); split: -0.65%, +0.22%
Fill count: 4145 -> 4143 (-0.05%)

Tiger Lake and Skylake had similar results. (Tiger Lake shown)
Totals:
Instrs: 269733849 -> 269733590 (-0.00%)
Cycle count: 25240548036 -> 25241435039 (+0.00%); split: -0.00%, +0.01%

Totals from 123 (0.01% of 903812) affected shaders:
Instrs: 338617 -> 338358 (-0.08%)
Cycle count: 326605644 -> 327492647 (+0.27%); split: -0.13%, +0.40%

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37186>
This commit is contained in:
@@ -1837,7 +1837,9 @@ get_alu_uub(struct analysis_state *state, struct scalar_query q, uint32_t *resul
|
||||
case nir_op_bcsel:
|
||||
case nir_op_b32csel:
|
||||
case nir_op_ubfe:
|
||||
case nir_op_bfi:
|
||||
case nir_op_bfm:
|
||||
case nir_op_bitfield_select:
|
||||
case nir_op_extract_u8:
|
||||
case nir_op_extract_i8:
|
||||
case nir_op_extract_u16:
|
||||
@@ -1999,6 +2001,57 @@ get_alu_uub(struct analysis_state *state, struct scalar_query q, uint32_t *resul
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_op_bfi: {
|
||||
nir_scalar src0_scalar = nir_scalar_chase_alu_src(q.scalar, 0);
|
||||
const uint64_t s1 = bitmask(util_last_bit64(src[1]));
|
||||
const uint64_t s2 = bitmask(util_last_bit64(src[2]));
|
||||
|
||||
if (nir_scalar_is_const(src0_scalar)) {
|
||||
const uint64_t s0 = nir_scalar_as_uint(src0_scalar);
|
||||
|
||||
/* This case should be eliminated by opt_algebraic. */
|
||||
if (s0 == 0) {
|
||||
*result = s2;
|
||||
} else {
|
||||
const int x = ffsll(s0) - 1;
|
||||
*result = (s0 & (s1 << x)) | (~s0 & s2);
|
||||
}
|
||||
} else {
|
||||
const uint64_t s0 = bitmask(util_last_bit64(src[0]));
|
||||
|
||||
/* Due to the unpredictable shift, the true maximum value of (s0 &
|
||||
* (s1 << x)) cannot be known. However, it cannot be larger than
|
||||
* s0.
|
||||
*
|
||||
* inot doesn't work in get_alu_uub. It is known that (~s0 & s2)
|
||||
* cannot be larger than s2, so just use s2 as a loose upper bound.
|
||||
*/
|
||||
*result = s0 | s2;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_op_bitfield_select: {
|
||||
nir_scalar src0_scalar = nir_scalar_chase_alu_src(q.scalar, 0);
|
||||
const uint64_t s1 = bitmask(util_last_bit64(src[1]));
|
||||
const uint64_t s2 = bitmask(util_last_bit64(src[2]));
|
||||
|
||||
if (nir_scalar_is_const(src0_scalar)) {
|
||||
const uint64_t s0 = nir_scalar_as_uint(src0_scalar);
|
||||
|
||||
*result = (s0 & s1) | (~s0 & s2);
|
||||
} else {
|
||||
const uint64_t s0 = bitmask(util_last_bit64(src[0]));
|
||||
|
||||
/* inot doesn't work in get_alu_uub. It is known that (~s0 & s2)
|
||||
* cannot be larger than s2, so just use s2 as a loose upper bound.
|
||||
*/
|
||||
*result = (s0 & s1) | s2;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/* limited floating-point support for f2u32(fmul(load_input(), <constant>)) */
|
||||
case nir_op_f2i32:
|
||||
case nir_op_f2u32:
|
||||
|
||||
Reference in New Issue
Block a user