nir,nak: Add KeplerB shared atomics intrinsics and lowering

Kepler cards do not support shared atomic operations directly, but they
have special ldslk and stsul instructions that can be used to implement
mutex locks on addresses. Shared atomics can therefore be lowered into
regular operations performed while holding such a lock.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35028>
This commit is contained in:
Lorenzo Rossi
2025-05-19 20:58:41 +02:00
committed by Marge Bot
parent 88e449dc85
commit 47f6c74b71
6 changed files with 138 additions and 1 deletions
@@ -926,6 +926,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_sample_mask:
case nir_intrinsic_quad_ballot_agx:
case nir_intrinsic_load_agx:
case nir_intrinsic_load_shared_lock_nv:
case nir_intrinsic_store_shared_unlock_nv:
is_divergent = true;
break;
+4
View File
@@ -2485,6 +2485,10 @@ intrinsic("ldtram_nv", dest_comp=2, bit_sizes=[32],
# NVIDIA-specific Image intrinsics
image("load_raw_nv", src_comp=[4, 1, 1], extra_indices=[DEST_TYPE], dest_comp=0, flags=[CAN_ELIMINATE])
# NVIDIA Kepler-specific load-lock / store-unlock intrinsics,
# used to lower shared atomics.
#
# load_shared_lock_nv: src[] = { address }. The Kepler shared-atomics
# lowering reads component 0 of the result as the loaded data and
# component 1 as the "is locked" flag (non-zero when the lock was taken).
intrinsic("load_shared_lock_nv", src_comp=[1], dest_comp=2)
# store_shared_unlock_nv: src[] = { value, address }. The lowering treats a
# non-zero result as a successful store and retries otherwise.
intrinsic("store_shared_unlock_nv", src_comp=[1, 1], dest_comp=1)
# NVIDIA-specific Geometry Shader intrinsics.
# These contain an additional integer source and destination with the primitive handle input/output.
+1
View File
@@ -35,6 +35,7 @@ libnak_c_files = files(
'nak_nir_lower_gs_intrinsics.c',
'nak_nir_lower_non_uniform_ldcx.c',
'nak_nir_lower_scan_reduce.c',
'nak_nir_lower_kepler_shared_atomics.c',
'nak_nir_lower_tex.c',
'nak_nir_lower_vtg_io.c',
'nak_nir_mark_lcssa_invariants.c',
+8 -1
View File
@@ -958,7 +958,14 @@ nak_postprocess_nir(nir_shader *nir,
.lower_rotate_to_shuffle = true
};
OPT(nir, nir_lower_subgroups, &subgroups_options);
OPT(nir, nir_lower_atomics, atomic_supported);
if (nak->sm >= 50) {
// On Maxwell+ we need to lower shared 64-bit atomics into
// compare-and-swap loops
OPT(nir, nir_lower_atomics, atomic_supported);
} else {
// On Kepler we need to lower shared atomics into locked ld-st
OPT(nir, nak_nir_lower_kepler_shared_atomics);
}
OPT(nir, nak_nir_lower_scan_reduce);
if (nir_shader_has_local_variables(nir)) {
@@ -0,0 +1,122 @@
/*
* Copyright © 2025 Lorenzo Rossi
* SPDX-License-Identifier: MIT
*/
#include "nak_private.h"
#include "nir.h"
#include "nir_builder.h"
/*
 * Convert atomic arithmetic to regular arithmetic along with mutex locks.
 *
 * eg:
 * result = atomicAdd(&a[0], 1) ->
 *
 *    uint data;
 *    bool success = false;
 *    do {
 *       data, is_locked = load_locked(&a[0]);
 *       if (is_locked) {
 *          success = store_and_unlock(&a[0], data + 1);
 *       }
 *    } while (!success);
 *    result = data; // the value observed before the update
 *
 * cmpxchg and xchg are special-cased: they do not map to a single ALU op.
 */
/*
 * Compute the value that must be written back for a shared atomic, assuming
 * the lock on the address is already held.
 *
 * b      - builder positioned where the computation should be emitted.
 * intr   - the atomic intrinsic being lowered; src[1] is the operand and,
 *          for cmpxchg, src[2] is the replacement value.
 * loaded - the value previously read from the address.
 *
 * Returns the nir_def to store, i.e. op(loaded, operand).
 */
static nir_def *
lower_atomic_in_lock(nir_builder *b, nir_intrinsic_instr *intr, nir_def *loaded)
{
   const nir_atomic_op atomic_op = nir_intrinsic_atomic_op(intr);
   nir_def *operand = intr->src[1].ssa;

   // Exchange ignores the old value: op(loaded, operand) = operand
   if (atomic_op == nir_atomic_op_xchg) {
      return operand;
   }

   // op(loaded, cmp, swap) = loaded == cmp ? swap : loaded
   if (atomic_op == nir_atomic_op_cmpxchg) {
      nir_def *swap = intr->src[2].ssa;
      nir_def *matches = nir_ieq(b, loaded, operand);
      return nir_bcsel(b, matches, swap, loaded);
   }

   // Everything left must map onto a two-source ALU instruction.
   switch (atomic_op) {
   case nir_atomic_op_iadd:
   case nir_atomic_op_imin:
   case nir_atomic_op_umin:
   case nir_atomic_op_imax:
   case nir_atomic_op_umax:
   case nir_atomic_op_iand:
   case nir_atomic_op_ior:
   case nir_atomic_op_ixor:
   case nir_atomic_op_fadd:
   case nir_atomic_op_fmin:
   case nir_atomic_op_fmax:
      break;
   case nir_atomic_op_fcmpxchg: /* TODO: shared atomic floats */
   default:
      unreachable("Invalid intrinsic");
   }

   nir_def *result =
      nir_build_alu2(b, nir_atomic_op_to_alu(atomic_op), loaded, operand);

   // Mark the ALU op exact and clear its fast-math flags.
   nir_alu_instr *alu_instr = nir_instr_as_alu(result->parent_instr);
   alu_instr->exact = true;
   alu_instr->fp_fast_math = 0;

   return result;
}
/*
 * Emit the lock / modify / unlock retry loop that replaces one shared atomic.
 *
 * Inside a loop, the address (src[0] of intr) is read with
 * load_shared_lock_nv. When the second result component is non-zero the
 * new value is computed by lower_atomic_in_lock() and written with
 * store_shared_unlock_nv; the loop is left once that store reports a
 * non-zero result.
 *
 * Returns the first component of the locked load, which the caller uses as
 * the result of the original atomic.
 */
static nir_def *
build_atomic(nir_builder *b, nir_intrinsic_instr *intr)
{
   // TODO: this is currently compiled down to ~20 instructions while
   // CUDA can optimize the same code to only ~5.
   const unsigned bit_size = intr->def.bit_size;
   nir_def *address = intr->src[0].ssa;
   nir_def *old_value;

   nir_loop *retry_loop = nir_push_loop(b);
   {
      nir_def *locked_load = nir_load_shared_lock_nv(b, bit_size, address);
      old_value = nir_channel(b, locked_load, 0);

      // The lock flag has the load's bit size; compare it as a 32-bit value.
      nir_def *lock_flag = nir_channel(b, locked_load, 1);
      nir_def *lock_flag32 = nir_u2u32(b, lock_flag);
      nir_def *got_lock = nir_ine_imm(b, lock_flag32, 0);

      nir_if *if_locked = nir_push_if(b, got_lock);
      {
         nir_def *updated = lower_atomic_in_lock(b, intr, old_value);
         nir_def *stored =
            nir_store_shared_unlock_nv(b, 32, updated, address);
         nir_def *done = nir_ine_imm(b, stored, 0);
         nir_break_if(b, done);
      }
      nir_pop_if(b, if_locked);
   }
   nir_pop_loop(b, retry_loop);

   return old_value;
}
/*
 * Per-intrinsic callback: replaces shared_atomic and shared_atomic_swap
 * with the locked load/store loop built by build_atomic().
 *
 * Returns true when the intrinsic was rewritten, false when it was left
 * untouched.
 */
static bool
nak_nir_lower_kepler_atomics_intrin(nir_builder *b,
                                    nir_intrinsic_instr *intrin,
                                    UNUSED void *_data)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_shared_atomic:
   case nir_intrinsic_shared_atomic_swap:
      break;
   default:
      return false;
   }

   // Emit the replacement right before the atomic, then swap it in.
   b->cursor = nir_before_instr(&intrin->instr);
   nir_def *replacement = build_atomic(b, intrin);
   nir_def_replace(&intrin->def, replacement);

   return true;
}
/*
 * Lower every shared atomic in the shader into a locked load / store loop.
 *
 * No metadata is preserved (nir_metadata_none), since the lowering inserts
 * new loops and ifs. Returns the result of nir_shader_intrinsics_pass().
 */
bool
nak_nir_lower_kepler_shared_atomics(nir_shader *nir)
{
   const bool progress =
      nir_shader_intrinsics_pass(nir, nak_nir_lower_kepler_atomics_intrin,
                                 nir_metadata_none, NULL);
   return progress;
}
+1
View File
@@ -202,6 +202,7 @@ bool nak_nir_lower_scan_reduce(nir_shader *shader);
bool nak_nir_lower_tex(nir_shader *nir, const struct nak_compiler *nak);
bool nak_nir_lower_gs_intrinsics(nir_shader *shader);
bool nak_nir_lower_algebraic_late(nir_shader *nir, const struct nak_compiler *nak);
bool nak_nir_lower_kepler_shared_atomics(nir_shader *shader);
struct nak_nir_attr_io_flags {
bool output : 1;