Files
mesa/src/intel/compiler/intel_nir_blockify_uniform_loads.c
T
Alyssa Rosenzweig 82ae8b1d33 treewide: simplify nir_def_rewrite_uses_after
Most of the time with nir_def_rewrite_uses_after, you want to rewrite after the
replacement. Make that the default thing to be more ergonomic and to drop
parent_instr uses.

We leave nir_def_rewrite_uses_after_instr defined if you really want the old
signature with an arbitrary after point.

Via Coccinelle patch:

    @@
    expression a, b;
    @@

    -nir_def_rewrite_uses_after(a, b, b->parent_instr)
    +nir_def_rewrite_uses_after_def(a, b)

Followed by a bunch of sed.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Emma Anholt <emma@anholt.net>
Reviewed-by: Marek Olšák <maraeo@gmail.com>
Acked-by: Karol Herbst <kherbst@redhat.com>
Acked-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36489>
2025-08-01 15:34:24 +00:00

277 lines
10 KiB
C

/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "dev/intel_device_info.h"
#include "intel_nir.h"
#include "isl/isl.h"
#include "nir_builder.h"
/**
 * Per-instruction callback that rebases a constant-offset UBO block load onto
 * a 64-byte boundary.
 *
 * Only load_ubo_uniform_block_intel intrinsics whose offset source (src[1])
 * is constant are considered.  The load is widened to a full block, its
 * offset is rounded down to a multiple of 64 bytes, and all later uses of the
 * original value are redirected to a vector that picks the originally read
 * components out of the widened result.
 *
 * \param b        builder used to emit the new offset and the fix-up vector
 * \param instr    instruction being visited
 * \param cb_data  unused
 *
 * \return true if the instruction was rewritten, false if it was left alone
 */
static bool
rebase_const_offset_ubo_loads_instr(nir_builder *b,
                                    nir_instr *instr,
                                    void *cb_data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);

   if (load->intrinsic != nir_intrinsic_load_ubo_uniform_block_intel ||
       !nir_src_is_const(load->src[1]))
      return false;

   const unsigned cacheline_bytes = 64;
   const unsigned elem_bytes = load->def.bit_size / 8;
   const unsigned max_comps =
      MIN2(cacheline_bytes / elem_bytes, NIR_MAX_VEC_COMPONENTS);

   const unsigned old_offset = nir_src_as_uint(load->src[1]);
   const unsigned aligned_offset = ROUND_DOWN_TO(old_offset, cacheline_bytes);

   const unsigned def_comps = load->def.num_components;
   const unsigned used_comps = nir_def_last_component_read(&load->def) + 1;

   /* Number of components that sit between the aligned offset and the
    * offset the load originally started at.
    */
   const unsigned lead_comps = (old_offset - aligned_offset) / elem_bytes;

   /* Rebasing must never force one load to be split into two, so give up
    * when the padding plus the components actually read overflow a block.
    */
   if (used_comps + lead_comps > max_comps)
      return false;

   /* Widen to a whole block so that reads of different sizes become
    * identical and can be CSE'd.  Unused trailing components are skipped by
    * the backend anyway.
    */
   load->def.num_components = max_comps;
   load->num_components = max_comps;
   nir_intrinsic_set_range(load, max_comps * elem_bytes);
   nir_intrinsic_set_align_offset(load, 0);

   /* This pass runs before the constant offset extraction, so the base is
    * expected to still be 0.  Anything else means another pass changed it
    * and likely didn't take into account our HW requirements.
    */
   assert(nir_intrinsic_base(load) == 0);

   if (lead_comps != 0) {
      /* Point the load at the lower, aligned offset.  The values the users
       * want now live in higher vector components, which is handled below.
       */
      b->cursor = nir_before_instr(instr);
      nir_src_rewrite(&load->src[1], nir_imm_int(b, aligned_offset));
   }

   b->cursor = nir_after_instr(instr);

   nir_scalar picked[NIR_MAX_VEC_COMPONENTS];
   nir_scalar undef = nir_get_scalar(nir_undef(b, 1, elem_bytes * 8), 0);

   /* Components that were read come from the widened load, shifted by the
    * padding; the remaining components of the original vector are undef.
    */
   unsigned c = 0;
   while (c < used_comps) {
      picked[c] = nir_get_scalar(&load->def, lead_comps + c);
      c++;
   }
   while (c < def_comps)
      picked[c++] = undef;

   nir_def *rebased = nir_vec_scalars(b, picked, def_comps);
   rebased->divergent = false;

   nir_def_rewrite_uses_after(&load->def, rebased);

   return true;
}
/**
* Shaders commonly contain small UBO loads with a constant offset scattered
* throughout the program. Ideally, we want to vectorize those into larger
* block loads so we can load whole cachelines at a time, or at least fill
* whole 32B registers rather than having empty space.
*
* nir_opt_load_store_vectorize() is terrific for combining small loads into
* nice large block loads. Unfortunately, it only vectorizes within a single
* basic block, and there's a lot of opportunity for optimizing globally.
*
* In the past, our backend loaded whole 64B cachelines at a time (on pre-Xe2,
* two registers) and rounded down constant UBO load offsets to the nearest
* multiple of 64B. This meant multiple loads within the same 64B would be
* CSE'd into the same load, and we could even take advantage of global CSE.
* However, we didn't have a method for shrinking loads from 64B back to 32B
* again, and also didn't have a lot of flexibility in how this interacted
* with the NIR load/store vectorization.
*
* This pass takes a similar approach, but in NIR. The idea is to:
*
* 1. Run load/store vectorization to combine accesses within a basic block
*
* 2. Find load_ubo_uniform_block_intel intrinsics with constant offsets.
* Round their base down to the nearest multiple of 64B, and also increase
* their returned vector to be a vec16 (64B for 32-bit values). However,
* only do this if a single vec16 load would cover this additional "pad"
* space at the front, and all used components of the existing load. That
* way, we don't blindly turn a single load into two loads.
*
* If we made any progress, then...
*
* 3. Run global CSE. This will coalesce any accesses to the same 64B
* region across subtrees of the CFG.
*
* 4. Run the load/store vectorizer again for UBOs. This will clean up
* any overlapping memory access within a block.
*
* 5. Have the backend only issue loads for components of the vec16 which
* are actually read. We could also shrink this in NIR, but doing it in
* the backend is pretty straightforward.
*
* We could probably do better with a fancier sliding-window type pass
* which looked across blocks to produce optimal loads. However, this
* simple hack using existing passes does a fairly good job for now.
*/
bool
brw_nir_rebase_const_offset_ubo_loads(nir_shader *shader)
{
   /* Visit every instruction in the shader with the rebasing callback.  The
    * callback takes no user data, hence the NULL.  The return value of the
    * pass helper is forwarded to the caller unchanged.
    */
   bool progress =
      nir_shader_instructions_pass(shader,
                                   rebase_const_offset_ubo_loads_instr,
                                   nir_metadata_control_flow |
                                   nir_metadata_live_defs,
                                   NULL);

   return progress;
}
/**
 * Per-instruction callback that turns eligible non-divergent 32-bit loads
 * into their *_uniform_block_intel counterparts.
 *
 * UBO and SSBO loads are replaced by a newly built block-load intrinsic;
 * shared and global-constant loads have their opcode switched in place.
 *
 * \param b        builder used to emit replacement UBO/SSBO block loads
 * \param instr    instruction being visited
 * \param cb_data  the const struct intel_device_info of the target device
 *
 * \return true if the instruction was converted, false otherwise
 */
static bool
intel_nir_blockify_uniform_loads_instr(nir_builder *b,
                                       nir_instr *instr,
                                       void *cb_data)
{
   const struct intel_device_info *devinfo = cb_data;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);

   switch (load->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ssbo: {
      /* BDW PRMs, Volume 7: 3D-Media-GPGPU: OWord Block ReadWrite:
       *
       *    "The surface base address must be OWord-aligned."
       *
       * That cannot be guaranteed for SSBOs, whose alignment is 4 bytes.
       */
      if (devinfo->ver < 9)
         return false;

      if (nir_src_is_divergent(&load->src[1]) || load->def.bit_size != 32)
         return false;

      /* Block loads without the LSC need at least 4 dwords (1 oword). */
      if (!devinfo->has_lsc && load->def.num_components < 4)
         return false;

      b->cursor = nir_before_instr(&load->instr);

      nir_def *block_load;
      if (load->intrinsic == nir_intrinsic_load_ubo) {
         block_load = nir_load_ubo_uniform_block_intel(
            b,
            load->def.num_components,
            load->def.bit_size,
            load->src[0].ssa,
            load->src[1].ssa,
            .access = nir_intrinsic_access(load),
            .align_mul = nir_intrinsic_align_mul(load),
            .align_offset = nir_intrinsic_align_offset(load),
            .base = 0,
            .range = nir_intrinsic_range(load));
      } else {
         block_load = nir_load_ssbo_uniform_block_intel(
            b,
            load->def.num_components,
            load->def.bit_size,
            load->src[0].ssa,
            load->src[1].ssa,
            .access = nir_intrinsic_access(load),
            .align_mul = nir_intrinsic_align_mul(load),
            .align_offset = nir_intrinsic_align_offset(load),
            .base = 0);
      }

      /* Carry over the loop-invariance flag of the old value and mark the
       * new value as not divergent.
       */
      block_load->loop_invariant = load->def.loop_invariant;
      block_load->divergent = false;

      nir_def_replace(&load->def, block_load);
      return true;
   }

   case nir_intrinsic_load_shared:
      /* Shared-memory block loads are not supported before Icelake. */
      if (devinfo->ver < 11)
         return false;

      if (nir_src_is_divergent(&load->src[0]) || load->def.bit_size != 32)
         return false;

      /* Without the LSC, OWord Block Load messages have to be used, and
       * those additionally require OWord-aligned offsets.
       */
      if (!devinfo->has_lsc &&
          (load->def.num_components < 4 || nir_intrinsic_align(load) < 16))
         return false;

      load->intrinsic = nir_intrinsic_load_shared_uniform_block_intel;
      return true;

   case nir_intrinsic_load_global_constant:
      if (nir_src_is_divergent(&load->src[0]) || load->def.bit_size != 32)
         return false;

      /* Block loads without the LSC need at least 4 dwords (1 oword). */
      if (!devinfo->has_lsc && load->def.num_components < 4)
         return false;

      load->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel;
      return true;

   default:
      return false;
   }
}
/**
 * Runs the uniform-load blockification callback over every instruction of
 * \p shader.
 *
 * \param shader   shader to process
 * \param devinfo  device description, handed to the per-instruction callback
 *
 * \return the result of nir_shader_instructions_pass()
 */
bool
intel_nir_blockify_uniform_loads(nir_shader *shader,
                                 const struct intel_device_info *devinfo)
{
   /* The callback queries nir_src_is_divergent(), so divergence analysis is
    * run first.
    */
   nir_divergence_analysis(shader);

   bool progress =
      nir_shader_instructions_pass(shader,
                                   intel_nir_blockify_uniform_loads_instr,
                                   nir_metadata_control_flow,
                                   (void *) devinfo);

   return progress;
}