broadcom/compiler: generate mali opcodes for clamping on Pi5

Models C0 and D0 support these opcodes too.

total instructions in shared programs: 10869461 -> 10856992 (-0.11%)
instructions in affected programs: 1467666 -> 1455197 (-0.85%)
helped: 6012
HURT: 1413
Instructions are helped.

total threads in shared programs: 431014 -> 431010 (<.01%)
threads in affected programs: 8 -> 4 (-50.00%)
helped: 0
HURT: 2

total uniforms in shared programs: 5432771 -> 5430909 (-0.03%)
uniforms in affected programs: 183047 -> 181185 (-1.02%)
helped: 976
HURT: 128
Uniforms are helped.

total max-temps in shared programs: 2235272 -> 2234069 (-0.05%)
max-temps in affected programs: 38163 -> 36960 (-3.15%)
helped: 1262
HURT: 168
Max-temps are helped.

total spills in shared programs: 4331 -> 4363 (0.74%)
spills in affected programs: 964 -> 996 (3.32%)
helped: 6
HURT: 47

total fills in shared programs: 6527 -> 6622 (1.46%)
fills in affected programs: 2047 -> 2142 (4.64%)
helped: 6
HURT: 47

total sfu-stalls in shared programs: 15807 -> 15935 (0.81%)
sfu-stalls in affected programs: 787 -> 915 (16.26%)
helped: 71
HURT: 172
Sfu-stalls are HURT.

total inst-and-stalls in shared programs: 10885268 -> 10872927 (-0.11%)
inst-and-stalls in affected programs: 1469423 -> 1457082 (-0.84%)
helped: 5998
HURT: 1417
Inst-and-stalls are helped.

total nops in shared programs: 184280 -> 185612 (0.72%)
nops in affected programs: 10000 -> 11332 (13.32%)
helped: 311
HURT: 1193
Nops are HURT.

The results show a reduction in register pressure, but an increase in spills, which looks contradictory. This is because, for some reason, this optimization makes the NIR scheduler produce code for some shaders in Godot that causes additional spilling, but the problem seems to be exclusive to Godot shaders and not really related to the optimization itself but to how the NIR scheduler works.
Excluding Godot shaders, we actually see a decrease in spills and a slightly larger improvement in instruction counts:

total instructions in shared programs: 10720106 -> 10707621 (-0.12%)
instructions in affected programs: 1375316 -> 1362831 (-0.91%)
helped: 5948
HURT: 1364
Instructions are helped.

total threads in shared programs: 428248 -> 428244 (<.01%)
threads in affected programs: 8 -> 4 (-50.00%)
helped: 0
HURT: 2

total spills in shared programs: 3729 -> 3712 (-0.46%)
spills in affected programs: 451 -> 434 (-3.77%)
helped: 6
HURT: 0

total fills in shared programs: 4738 -> 4714 (-0.51%)
fills in affected programs: 564 -> 540 (-4.26%)
helped: 6
HURT: 0

Comparing only shaders from Godot:

total instructions in shared programs: 149355 -> 149371 (0.01%)
instructions in affected programs: 92350 -> 92366 (0.02%)
helped: 64
HURT: 49
Inconclusive result (value mean confidence interval includes 0).

total max-temps in shared programs: 16477 -> 16472 (-0.03%)
max-temps in affected programs: 180 -> 175 (-2.78%)
helped: 5
HURT: 0
Max-temps are helped.

total spills in shared programs: 602 -> 651 (8.14%)
spills in affected programs: 513 -> 562 (9.55%)
helped: 0
HURT: 47

total fills in shared programs: 1789 -> 1908 (6.65%)
fills in affected programs: 1483 -> 1602 (8.02%)
helped: 0
HURT: 47

Reviewed-by: Jose Maria Casanova Crespo <jmcasanova@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31480>
2024-09-30 12:34:03 +02:00
parent c57be33d96
commit c58bfb355a
3 changed files with 13 additions and 3 deletions
@@ -2245,7 +2245,7 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
        /* needs to be outside of optimization loop, otherwise it fights with
         * opt_algebraic optimizing the conversion lowering
         */
-        NIR_PASS(progress, s, v3d_nir_lower_algebraic);
+        NIR_PASS(progress, s, v3d_nir_lower_algebraic, c);
        NIR_PASS(progress, s, nir_opt_cse);

        nir_move_options sink_opts =
@@ -1209,7 +1209,7 @@ bool v3d_nir_lower_txf_ms(nir_shader *s);
 bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c);
 bool v3d_nir_lower_global_2x32(nir_shader *s);
 bool v3d_nir_lower_load_store_bitsize(nir_shader *s);
-bool v3d_nir_lower_algebraic(struct nir_shader *shader);
+bool v3d_nir_lower_algebraic(struct nir_shader *shader, const struct v3d_compile *c);

 void v3d_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr);
 void v3d_vir_emit_image_load_store(struct v3d_compile *c,
@@ -5,6 +5,9 @@ import sys

 a = 'a'

+has_unpack_sat = 'c && v3d_device_has_unpack_sat(c->devinfo)'
+has_unpack_max0 = 'c && v3d_device_has_unpack_max0(c->devinfo)'
+
 lower_alu = [
    (('f2i8', a), ('i2i8', ('f2i32', a))),
    (('f2i16', a), ('i2i16', ('f2i32', a))),
@@ -17,6 +20,10 @@ lower_alu = [

    (('u2f32', 'a@8'), ('u2f32', ('u2u32', a))),
    (('u2f32', 'a@16'), ('u2f32', ('u2u32', a))),
+
+    (('fmin', ('fmax', a, -1.0), 1.0), ('fsat_signed', a), has_unpack_sat),
+    (('fmax', ('fmin', a, 1.0), -1.0), ('fsat_signed', a), has_unpack_sat),
+    (('fmax', a, 0.0), ('fclamp_pos', a), has_unpack_max0),
 ]

 def main():
@@ -32,7 +39,10 @@ def run():
    print('#include "v3d_compiler.h"')

    print(nir_algebraic.AlgebraicPass("v3d_nir_lower_algebraic",
-                                      lower_alu).render())
+                                      lower_alu,
+                                      [
+                                          ("const struct v3d_compile *", "c")
+                                      ]).render())

 if __name__ == '__main__':
    main()