From 1b24612c570727a0c637159eeebbd88e79715435 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Wed, 28 Feb 2024 10:52:47 -0800
Subject: [PATCH] brw/nir: Treat load_*_uniform_block_intel as convergent

Between 5 and 10 shaders (depending on the platform) from Blender are
massively helped for spills and fills (e.g., from 45 spills to 0, and
180 fills to 0).

Previously this commit caused a lot of spill and fill damage to
Wolfenstein Youngblood and Red Dead Redemption 2. I believe due to
!32041 and !32097, this is no longer the case. RDR2 is helped, and
Wolfenstein Youngblood has no changes.

However, q2rtx/q2rtx-rt-pipeline is hurt:

    Spill count: 126 -> 175 (+38.89%); split: -0.79%, +39.68%
    Fill count: 156 -> 235 (+50.64%); split: -1.92%, +52.56%

By the end of this series this damage is fixed, and q2rtx is helped
overall by -0.79% spills and -1.92% fills.

v2: Fix for Xe2.

v3: Just keep using bld for the group(1, 0) call. Suggested by Ken.

v4: Major re-write. Pass bld and xbld to fs_emit_memory_access. The big
fix is changing the way srcs[MEMORY_LOGICAL_ADDRESS] is calculated
(around line 7180). In previous versions of the commit, the address
would be calculated using bld (which is now xbld) even if the address
source was not is_scalar. This could cause the emit_uniformize (later in
the function) to fetch garbage. This also drops the special case
handling of constant offset. Constant propagation and algebraic will
handle this.

v5: Fix a subtle bug that was ultimately caused by the removal of
offset_to_component. The MEMORY_LOGICAL_ADDRESS for
load_shared_uniform_block_intel was being calculated as SIMD16 on LNL,
but the later emit_uniformize would treat it as SIMD32. This caused GPU
hangs in Assassin's Creed Valhalla.

v6: Fix a bug in D16 to D16U32 expansion. Noticed by Ken. Add a comment
explaining bld vs xbld vs ubld in fs_nir_emit_memory_access. Suggested
by Ken.

v7: Revert some of the v6 changes related to D16 to D16U32
expansion. This code was mostly correct. xbld is correct because DATA0
needs to be generated in size of the eventual SEND instruction. Using
offset(nir_src, xbld, c) will cause offset() to correctly add
component(..., 0) if nir_src.is_scalar but xbld is not scalar_group().

v8: Convergent handling of nir_intrinsic_load_shared_uniform_block_intel
was removed. It caused reproducible hangs in Assassin's Creed: Valhalla.
There are some other compiler issues related to this game, and we're not
yet sure exactly what the cause of any of it is.

shader-db:

Lunar Lake
total instructions in shared programs: 18058270 -> 18068886 (0.06%)
instructions in affected programs: 5196846 -> 5207462 (0.20%)
helped: 4442 / HURT: 11416

total cycles in shared programs: 921324492 -> 919819398 (-0.16%)
cycles in affected programs: 733274162 -> 731769068 (-0.21%)
helped: 11312 / HURT: 31788

total spills in shared programs: 3633 -> 3585 (-1.32%)
spills in affected programs: 48 -> 0
helped: 5 / HURT: 0

total fills in shared programs: 2277 -> 2198 (-3.47%)
fills in affected programs: 79 -> 0
helped: 5 / HURT: 0

LOST:   123
GAINED: 377

Meteor Lake, DG2, and Tiger Lake had similar results. (Meteor Lake shown)
total instructions in shared programs: 19703458 -> 19699173 (-0.02%)
instructions in affected programs: 5885251 -> 5880966 (-0.07%)
helped: 4545 / HURT: 14971

total cycles in shared programs: 903497253 -> 902054570 (-0.16%)
cycles in affected programs: 691762248 -> 690319565 (-0.21%)
helped: 16412 / HURT: 28080

total spills in shared programs: 4894 -> 4646 (-5.07%)
spills in affected programs: 248 -> 0
helped: 7 / HURT: 0

total fills in shared programs: 6638 -> 5581 (-15.92%)
fills in affected programs: 1057 -> 0
helped: 7 / HURT: 0

LOST:   427
GAINED: 978

Ice Lake and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 20384200 -> 20384889 (<.01%)
instructions in affected programs: 5295084 -> 5295773 (0.01%)
helped: 5309 / HURT: 12564

total cycles in shared programs: 873002832 -> 872515246 (-0.06%)
cycles in affected programs: 463413458 -> 462925872 (-0.11%)
helped: 16079 / HURT: 13339

total spills in shared programs: 4552 -> 4373 (-3.93%)
spills in affected programs: 546 -> 367 (-32.78%)
helped: 11 / HURT: 0

total fills in shared programs: 5298 -> 4657 (-12.10%)
fills in affected programs: 1798 -> 1157 (-35.65%)
helped: 10 / HURT: 0

LOST:   380
GAINED: 925

fossil-db:

All Intel platforms had similar results. (Lunar Lake shown)
Totals:
Instrs: 141528822 -> 141728392 (+0.14%); split: -0.21%, +0.35%
Subgroup size: 10968048 -> 10968144 (+0.00%)
Send messages: 6567930 -> 6567909 (-0.00%)
Cycle count: 22165780202 -> 21624534624 (-2.44%); split: -3.09%, +0.65%
Spill count: 69890 -> 66665 (-4.61%); split: -5.06%, +0.44%
Fill count: 128331 -> 120189 (-6.34%); split: -7.44%, +1.09%
Scratch Memory Size: 5829632 -> 5664768 (-2.83%); split: -2.86%, +0.04%
Max live registers: 47928290 -> 47611371 (-0.66%); split: -0.71%, +0.05%

Totals from 364369 (66.18% of 550563) affected shaders:
Instrs: 113448842 -> 113648412 (+0.18%); split: -0.26%, +0.44%
Subgroup size: 7694080 -> 7694176 (+0.00%)
Send messages: 5308287 -> 5308266 (-0.00%)
Cycle count: 21885237842 -> 21343992264 (-2.47%); split: -3.13%, +0.65%
Spill count: 65152 -> 61927 (-4.95%); split: -5.42%, +0.47%
Fill count: 122811 -> 114669 (-6.63%); split: -7.77%, +1.14%
Scratch Memory Size: 5438464 -> 5273600 (-3.03%); split: -3.07%, +0.04%
Max live registers: 34355310 -> 34038391 (-0.92%); split: -1.00%, +0.07%

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
---
 src/intel/compiler/brw_fs_nir.cpp | 82 ++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 28 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 99bf67abd3e..df73191f81e 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -83,6 +83,7 @@ static void fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr);
 
 static void fs_nir_emit_memory_access(nir_to_brw_state &ntb,
                                       const fs_builder &bld,
+                                      const fs_builder &xbld,
                                       nir_intrinsic_instr *instr);
 
 static void brw_combine_with_vec(const fs_builder &bld, const brw_reg &dst,
@@ -1980,8 +1981,11 @@ get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform)
       case nir_intrinsic_load_btd_global_arg_addr_intel:
       case nir_intrinsic_load_btd_local_arg_addr_intel:
       case nir_intrinsic_load_btd_shader_type_intel:
+      case nir_intrinsic_load_global_constant_uniform_block_intel:
       case nir_intrinsic_load_inline_data_intel:
       case nir_intrinsic_load_reloc_const_intel:
+      case nir_intrinsic_load_ssbo_uniform_block_intel:
+      case nir_intrinsic_load_ubo_uniform_block_intel:
       case nir_intrinsic_load_workgroup_id:
          is_scalar = true;
          break;
@@ -5033,18 +5037,7 @@ try_rebuild_source(nir_to_brw_state &ntb, const brw::fs_builder &bld,
 
          case nir_intrinsic_load_ubo_uniform_block_intel:
          case nir_intrinsic_load_ssbo_uniform_block_intel: {
-            enum brw_reg_type type =
-               brw_type_with_size(BRW_TYPE_D, intrin->def.bit_size);
-            brw_reg src_data = retype(ntb.ssa_values[def->index], type);
-            unsigned n_components = ntb.s.alloc.sizes[src_data.nr] /
-                                    (bld.dispatch_width() / 8);
-            brw_reg dst_data = ubld.vgrf(type, n_components);
-            ntb.resource_insts[def->index] = ubld.MOV(dst_data, src_data);
-            for (unsigned c = 1; c < n_components; c++) {
-               ubld.MOV(offset(dst_data, ubld, c),
-                        offset(src_data, bld, c));
-            }
-            break;
+            unreachable("load_{ubo,ssbo}_uniform_block_intel should already be is_scalar");
          }
 
          default:
@@ -6079,7 +6072,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
    case nir_intrinsic_global_atomic_swap:
    case nir_intrinsic_load_scratch:
    case nir_intrinsic_store_scratch:
-      fs_nir_emit_memory_access(ntb, bld, instr);
+      fs_nir_emit_memory_access(ntb, bld, xbld, instr);
       break;
 
    case nir_intrinsic_image_size:
@@ -6489,7 +6482,7 @@ fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
             }
          } else {
             /* load_ubo_uniform_block_intel with non-constant offset */
-            fs_nir_emit_memory_access(ntb, bld, instr);
+            fs_nir_emit_memory_access(ntb, bld, xbld, instr);
          }
       } else {
          /* Even if we are loading doubles, a pull constant load will load
@@ -7082,9 +7075,23 @@ lsc_bits_to_data_size(unsigned bit_size)
    }
 }
 
+/**
+ *
+ * \param bld  "Normal" builder. This is the full dispatch width of the shader.
+ *
+ * \param xbld Builder for the intrinsic. If the intrinsic is convergent, this
+ *             builder will be scalar_group(). Otherwise it will be the same
+ *             as bld.
+ *
+ * Some places in the function will also use \c ubld. There are two cases of
+ * this. Sometimes it is to generate intermediate values as SIMD1. Other
+ * places that use \c ubld need a scalar_group() builder to operate on sources
+ * to the intrinsic that are is_scalar.
+ */
 static void
 fs_nir_emit_memory_access(nir_to_brw_state &ntb,
                           const fs_builder &bld,
+                          const fs_builder &xbld,
                           nir_intrinsic_instr *instr)
 {
    const intel_device_info *devinfo = ntb.devinfo;
@@ -7168,12 +7175,24 @@ fs_nir_emit_memory_access(nir_to_brw_state &ntb,
       srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_SHARED_LOCAL);
       srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_FLAT);
 
-      const nir_src &nir_src = instr->src[is_store ? 1 : 0];
+      const brw_reg nir_src = get_nir_src(ntb, instr->src[is_store ? 1 : 0]);
+      const fs_builder ubld = nir_src.is_scalar ? bld.scalar_group() : bld;
 
-      srcs[MEMORY_LOGICAL_ADDRESS] = nir_src_is_const(nir_src) ?
-         brw_imm_ud(nir_intrinsic_base(instr) + nir_src_as_uint(nir_src)) :
-         bld.ADD(retype(get_nir_src(ntb, nir_src), BRW_TYPE_UD),
-                 brw_imm_ud(nir_intrinsic_base(instr)));
+      /* If the logical address is not uniform, a call to emit_uniformize
+       * below will fix it up.
+       */
+      srcs[MEMORY_LOGICAL_ADDRESS] =
+         ubld.ADD(retype(nir_src, BRW_TYPE_UD),
+                  brw_imm_ud(nir_intrinsic_base(instr)));
+
+      /* If nir_src is_scalar, the MEMORY_LOGICAL_ADDRESS will be allocated at
+       * scalar_group() size and will have every component the same
+       * value. This is the definition of is_scalar. Much more importantly,
+       * setting is_scalar properly also ensures that emit_uniformize (below)
+       * will handle the value as scalar_group() size instead of full dispatch
+       * width.
+       */
+      srcs[MEMORY_LOGICAL_ADDRESS].is_scalar = nir_src.is_scalar;
 
       data_src = is_atomic ? 1 : 0;
       no_mask_handle = true;
@@ -7195,6 +7214,9 @@ fs_nir_emit_memory_access(nir_to_brw_state &ntb,
          if (devinfo->ver >= 20)
             bind = component(ubld.SHR(bind, brw_imm_ud(4)), 0);
 
+         /* load_scratch / store_scratch cannot be is_scalar yet. */
+         assert(xbld.dispatch_width() == bld.dispatch_width());
+
          srcs[MEMORY_LOGICAL_BINDING] = bind;
          srcs[MEMORY_LOGICAL_ADDRESS] =
             swizzle_nir_scratch_addr(ntb, bld, addr, false);
@@ -7202,6 +7224,10 @@ fs_nir_emit_memory_access(nir_to_brw_state &ntb,
          unsigned bit_size =
             is_store ? nir_src_bit_size(instr->src[0]) : instr->def.bit_size;
          bool dword_aligned = align >= 4 && bit_size == 32;
+
+         /* load_scratch / store_scratch cannot be is_scalar yet. */
+         assert(xbld.dispatch_width() == bld.dispatch_width());
+
          srcs[MEMORY_LOGICAL_BINDING_TYPE] =
             brw_imm_ud(LSC_ADDR_SURFTYPE_FLAT);
          srcs[MEMORY_LOGICAL_ADDRESS] =
@@ -7266,10 +7292,10 @@ fs_nir_emit_memory_access(nir_to_brw_state &ntb,
 
          if (data_bit_size > nir_bit_size) {
             /* Expand e.g. D16 to D16U32 */
-            srcs[MEMORY_LOGICAL_DATA0 + i] = bld.vgrf(data_type, components);
+            srcs[MEMORY_LOGICAL_DATA0 + i] = xbld.vgrf(data_type, components);
             for (unsigned c = 0; c < components; c++) {
-               bld.MOV(offset(srcs[MEMORY_LOGICAL_DATA0 + i], bld, c),
-                       offset(nir_src, bld, c));
+               xbld.MOV(offset(srcs[MEMORY_LOGICAL_DATA0 + i], xbld, c),
+                        offset(nir_src, xbld, c));
             }
          } else {
             srcs[MEMORY_LOGICAL_DATA0 + i] = nir_src;
@@ -7280,7 +7306,7 @@ fs_nir_emit_memory_access(nir_to_brw_state &ntb,
    brw_reg dest, nir_dest;
    if (!is_store) {
       nir_dest = retype(get_nir_def(ntb, instr->def), nir_data_type);
-      dest = data_bit_size > nir_bit_size ? bld.vgrf(data_type, components)
+      dest = data_bit_size > nir_bit_size ? xbld.vgrf(data_type, components)
                                           : nir_dest;
    }
 
@@ -7304,14 +7330,14 @@ fs_nir_emit_memory_access(nir_to_brw_state &ntb,
    fs_inst *inst;
 
    if (!block) {
-      inst = bld.emit(opcode, dest, srcs, MEMORY_LOGICAL_NUM_SRCS);
+      inst = xbld.emit(opcode, dest, srcs, MEMORY_LOGICAL_NUM_SRCS);
       inst->size_written *= components;
 
       if (dest.file != BAD_FILE && data_bit_size > nir_bit_size) {
          /* Shrink e.g. D16U32 result back to D16 */
          for (unsigned i = 0; i < components; i++) {
-            bld.MOV(offset(nir_dest, bld, i),
-                    subscript(offset(dest, bld, i), nir_dest.type, 0));
+            xbld.MOV(offset(nir_dest, xbld, i),
+                     subscript(offset(dest, xbld, i), nir_dest.type, 0));
          }
       }
    } else {
@@ -7371,8 +7397,8 @@ fs_nir_emit_memory_access(nir_to_brw_state &ntb,
 
       if (convergent_block_load) {
          for (unsigned c = 0; c < components; c++) {
-            bld.MOV(retype(offset(nir_dest, bld, c), BRW_TYPE_UD),
-                    component(dest, c));
+            xbld.MOV(retype(offset(nir_dest, xbld, c), BRW_TYPE_UD),
+                     component(dest, c));
          }
       }
    }