diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index 3e8015267a5..2f4e014f2a9 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -926,6 +926,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
    case nir_intrinsic_load_sample_mask:
    case nir_intrinsic_quad_ballot_agx:
    case nir_intrinsic_load_agx:
+   case nir_intrinsic_load_shared_lock_nv:
+   case nir_intrinsic_store_shared_unlock_nv:
       is_divergent = true;
       break;
 
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 99290c0ace9..68ec0949fba 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -2485,6 +2485,10 @@ intrinsic("ldtram_nv", dest_comp=2, bit_sizes=[32],
 
 # NVIDIA-specific Image intrinsics
 image("load_raw_nv", src_comp=[4, 1, 1], extra_indices=[DEST_TYPE], dest_comp=0, flags=[CAN_ELIMINATE])
+# NVIDIA Kepler-specific load-and-lock / store-and-unlock intrinsics,
+# used to lower shared atomics.
+intrinsic("load_shared_lock_nv", src_comp=[1], dest_comp=2)
+intrinsic("store_shared_unlock_nv", src_comp=[1, 1], dest_comp=1)
 
 # NVIDIA-specific Geometry Shader intrinsics.
 # These contain an additional integer source and destination with the primitive handle input/output.
diff --git a/src/nouveau/compiler/meson.build b/src/nouveau/compiler/meson.build
index dabbe478d9d..283ca0354bd 100644
--- a/src/nouveau/compiler/meson.build
+++ b/src/nouveau/compiler/meson.build
@@ -35,6 +35,7 @@ libnak_c_files = files(
   'nak_nir_lower_gs_intrinsics.c',
   'nak_nir_lower_non_uniform_ldcx.c',
   'nak_nir_lower_scan_reduce.c',
+  'nak_nir_lower_kepler_shared_atomics.c',
   'nak_nir_lower_tex.c',
   'nak_nir_lower_vtg_io.c',
   'nak_nir_mark_lcssa_invariants.c',
diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index 977c999670b..e6c164d00a9 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -958,7 +958,14 @@ nak_postprocess_nir(nir_shader *nir,
       .lower_rotate_to_shuffle = true
    };
    OPT(nir, nir_lower_subgroups, &subgroups_options);
-   OPT(nir, nir_lower_atomics, atomic_supported);
+   if (nak->sm >= 50) {
+      // On Maxwell+ we need to lower shared 64-bit atomics into
+      // compare-and-swap loops
+      OPT(nir, nir_lower_atomics, atomic_supported);
+   } else {
+      // On Kepler we need to lower shared atomics into locked ld-st
+      OPT(nir, nak_nir_lower_kepler_shared_atomics);
+   }
    OPT(nir, nak_nir_lower_scan_reduce);
 
    if (nir_shader_has_local_variables(nir)) {
diff --git a/src/nouveau/compiler/nak_nir_lower_kepler_shared_atomics.c b/src/nouveau/compiler/nak_nir_lower_kepler_shared_atomics.c
new file mode 100644
index 00000000000..b2114a80963
--- /dev/null
+++ b/src/nouveau/compiler/nak_nir_lower_kepler_shared_atomics.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright © 2025 Lorenzo Rossi
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "nak_private.h"
+#include "nir.h"
+#include "nir_builder.h"
+
+/*
+ * Lower shared atomics to a locked-load / unlocking-store retry loop.
+ *
+ * e.g.
+ *    old = atomicAdd(&a[0], 1) ->
+ *
+ * bool success = false;
+ * do {
+ *    old, is_locked = load_and_lock(&a[0]);
+ *    if (is_locked) {
+ *       new = old + 1;
+ *       success = store_and_unlock(&a[0], new);
+ *    }
+ * } while (!success);
+ *
+ * Special cases: xchg stores the operand unchanged; cmpxchg stores the new
+ * value only if old equals the comparand, otherwise it stores old back.
+ */
+
+static nir_def *
+lower_atomic_in_lock(nir_builder *b, nir_intrinsic_instr *intr, nir_def *loaded)
+{
+   // Called with the lock held: `loaded` is the value read from the address.
+   // Returns the value that must be stored back for this atomic op, i.e.
+   // to_store = op(loaded, data), where data is the intrinsic's src[1].
+   nir_def *data = intr->src[1].ssa;
+   nir_def *to_store;
+
+   switch (nir_intrinsic_atomic_op(intr)) {
+   case nir_atomic_op_imin:
+   case nir_atomic_op_umin:
+   case nir_atomic_op_imax:
+   case nir_atomic_op_umax:
+   case nir_atomic_op_iand:
+   case nir_atomic_op_ior:
+   case nir_atomic_op_ixor:
+   case nir_atomic_op_fadd:
+   case nir_atomic_op_fmin:
+   case nir_atomic_op_fmax:
+   case nir_atomic_op_iadd: {
+      to_store = nir_build_alu2(
+         b, nir_atomic_op_to_alu(nir_intrinsic_atomic_op(intr)), loaded, data);
+      nir_alu_instr *alu = nir_instr_as_alu(to_store->parent_instr);
+      alu->exact = true;
+      alu->fp_fast_math = 0;
+      break;
+   }
+   case nir_atomic_op_xchg: {
+      // op(loaded, data) = data
+      to_store = data;
+      break;
+   }
+   case nir_atomic_op_cmpxchg: {
+      // op(loaded, src1, src2) = loaded == src1 ? src2 : loaded;
+      nir_def *new_data = intr->src[2].ssa;
+      to_store = nir_bcsel(b, nir_ieq(b, loaded, data), new_data, loaded);
+      break;
+   }
+   case nir_atomic_op_fcmpxchg: /* TODO: shared atomic floats */
+   default:
+      unreachable("Invalid intrinsic");
+   }
+
+   return to_store;
+}
+/* Emits the lock / modify / unlock retry loop; returns the loaded value. */
+static nir_def *
+build_atomic(nir_builder *b, nir_intrinsic_instr *intr)
+{
+   // TODO: this is currently compiled down to ~20 instructions while
+   // CUDA can optimize the same code to only ~5.
+   nir_def *loaded_data;
+   nir_def *addr = intr->src[0].ssa;
+
+   nir_loop *loop = nir_push_loop(b);
+   {
+      nir_def *load = nir_load_shared_lock_nv(b, intr->def.bit_size, addr);
+      // load.x = value in memory, load.y != 0 iff we acquired the lock.
+      loaded_data = nir_channel(b, load, 0);
+      nir_def *is_locked = nir_u2u32(b, nir_channel(b, load, 1));
+      nir_if *nif = nir_push_if(b, nir_ine_imm(b, is_locked, 0));
+      {
+         nir_def *new_data = lower_atomic_in_lock(b, intr, loaded_data);
+         nir_def *success = nir_store_shared_unlock_nv(b, 32, new_data, addr);
+         // A zero result means the unlocking store failed: go around again.
+         nir_break_if(b, nir_ine_imm(b, success, 0));
+      }
+      nir_pop_if(b, nif);
+   }
+   nir_pop_loop(b, loop);
+   return loaded_data;
+}
+/* Pass callback: lowers one shared_atomic / shared_atomic_swap intrinsic. */
+static bool
+nak_nir_lower_kepler_atomics_intrin(nir_builder *b,
+                                    nir_intrinsic_instr *intrin,
+                                    UNUSED void *_data)
+{
+   if (intrin->intrinsic != nir_intrinsic_shared_atomic &&
+       intrin->intrinsic != nir_intrinsic_shared_atomic_swap)
+      return false;
+
+   b->cursor = nir_before_instr(&intrin->instr);
+   nir_def_replace(&intrin->def, build_atomic(b, intrin));
+   return true;
+}
+/* Entry point; nak_postprocess_nir() runs this pass when nak->sm < 50. */
+bool
+nak_nir_lower_kepler_shared_atomics(nir_shader *nir)
+{
+   return nir_shader_intrinsics_pass(nir, nak_nir_lower_kepler_atomics_intrin,
+                                     nir_metadata_none, NULL);
+}
diff --git a/src/nouveau/compiler/nak_private.h b/src/nouveau/compiler/nak_private.h
index cb43737544a..19df31566c8 100644
--- a/src/nouveau/compiler/nak_private.h
+++ b/src/nouveau/compiler/nak_private.h
@@ -202,6 +202,7 @@ bool nak_nir_lower_scan_reduce(nir_shader *shader);
 bool nak_nir_lower_tex(nir_shader *nir, const struct nak_compiler *nak);
 bool nak_nir_lower_gs_intrinsics(nir_shader *shader);
 bool nak_nir_lower_algebraic_late(nir_shader *nir, const struct nak_compiler *nak);
+bool nak_nir_lower_kepler_shared_atomics(nir_shader *shader);
 
 struct nak_nir_attr_io_flags {
    bool output : 1;