From 08ec408061e90857fef2eb58ee936a7afc5cdc60 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 1 Oct 2025 09:18:27 -0700 Subject: [PATCH] nir/algebraic: Optimize f2u of negative value to zero The eliminated SENDs are from a single app that has a bunch of fragment shaders with a sequence like: con 32 %495 = fmul! %203.i, %1 (0.000000) con 32 %496 = ffma! %203.j, %1 (0.000000), %495 con 32 %497 = ffma! %203.k, %1 (0.000000), %496 con 32 %498 = ffma! %203.l, %1 (0.000000), %497 con 32 %499 = @load_reloc_const_intel (param_idx=1, base=0) con 32 %500 = @load_reloc_const_intel (param_idx=0, base=0) con 32 %501 = f2u32 %498 con 32 %502 = umin %501, %172 (0x4) con 32 %503 = ishl %502, %172 (0x4) con 32 %504 = load_const (0x00000040 = 64) con 32 %505 = umin %503, %504 (0x40) con 32 %506 = iadd %500, %505 The `f2u` is replaced with 0, and that makes the `ffma` dot-product sequence be unused. Since it is unused, most of the preceeding block gets eliminated. A lot of instructions after the `f2u` are also eliminated by other algebraic optimizations. Most importantly, %203 is the result of a `load_ubo_uniform_block_intel` that is eliminated. No shader-db changes on any Intel platform. fossil-db: All Intel platforms had similar results. (Lunar Lake shown) Totals: Instrs: 919895603 -> 919804051 (-0.01%); split: -0.01%, +0.00% Send messages: 40892036 -> 40887569 (-0.01%) Cycle count: 99176770712 -> 99174971806 (-0.00%); split: -0.00%, +0.00% Max live registers: 190030365 -> 190030367 (+0.00%) Max dispatch width: 47415040 -> 47415024 (-0.00%) Non SSA regs after NIR: 228872538 -> 228863608 (-0.00%); split: -0.00%, +0.00% Totals from 2234 (0.11% of 1955134) affected shaders: Instrs: 1989743 -> 1898191 (-4.60%); split: -4.60%, +0.00% Send messages: 44179 -> 39712 (-10.11%) Cycle count: 25416114 -> 23617208 (-7.08%); split: -7.08%, +0.00% Max live registers: 367357 -> 367359 (+0.00%) Max dispatch width: 39184 -> 39168 (-0.04%) Non SSA regs after NIR: 471173 -> 462243 (-1.90%); split: -1.90%, +0.00% Reviewed-by: Georg Lehmann Part-of: --- src/compiler/nir/nir_opt_algebraic.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 4f9ea6d4718..6219af612eb 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -1879,6 +1879,16 @@ optimizations.extend([ (('f2i', ('ffloor', 'a(is_not_negative)')), ('f2i', a)), (('f2u', ('ffloor', a)), ('f2u', a)), + # Section 3.3.11 (Conversion Instructions) of the SPIR-V 1.6 spec says: + # + # "Behavior is undefined if Result Type is not wide enough to hold the + # converted value." + # + # Unsigned integers cannot hold negative values, so squash them to + # zero. This is what the conversion instruction on many GPUs would do + # anyway. + (('f2u', 'a(is_not_positive)'), 0), + # Conversions from 16 bits to 32 bits and back can always be removed (('f2fmp', ('f2f32', 'a@16')), a), (('i2imp', ('i2i32', 'a@16')), a),