From 1b2cd628b838753ec7faef38746397f35a107b0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 22 Jun 2024 23:07:37 -0400 Subject: [PATCH] nir: rename ordered_xfb_counter_add_gfx12_amd -> ordered_add_loop_gfx12_amd because it can also be used by compute. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/common/ac_nir_lower_ngg.c | 4 +-- src/amd/llvm/ac_nir_to_llvm.c | 13 +++++----- src/compiler/nir/nir_divergence_analysis.c | 2 +- src/compiler/nir/nir_intrinsics.py | 29 ++++++++++++---------- 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index 43e99ff07e7..e687cc184f7 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -1940,8 +1940,8 @@ ngg_build_streamout_buffer_info(nir_builder *b, */ if (use_gfx12_xfb_intrinsic) { buffer_offset_per_lane = - nir_ordered_xfb_counter_add_gfx12_amd(b, xfb_state_address, xfb_voffset, ordered_id, - atomic_src); + nir_ordered_add_loop_gfx12_amd(b, xfb_state_address, xfb_voffset, ordered_id, + atomic_src); } else { /* The NIR version of the above using nir_atomic_op_ordered_add_gfx12_amd. 
*/ enum { NUM_ATOMICS_IN_FLIGHT = 6 }; diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 90e561b1034..d6be2b3ae24 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -3600,20 +3600,20 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins } break; } - case nir_intrinsic_ordered_xfb_counter_add_gfx12_amd: { + case nir_intrinsic_ordered_add_loop_gfx12_amd: { const unsigned num_atomics = 6; /* max 8, using v0..v15 as temporaries */ char code[2048]; char *ptr = code; /* Assembly outputs: - * i32 VGPR $0 = dwordsWritten (set in 4 lanes) + * i32 VGPR $0 = previous value in memory * * Assembly inputs: - * EXEC = 0xf (4 lanes, set by nir_push_if()) + * EXEC = one lane per counter (use nir_push_if, streamout should always enable 4 lanes) * i64 SGPR $1 = atomic base address - * i32 VGPR $2 = voffset = 8 * threadIDInGroup + * i32 VGPR $2 = 32-bit VGPR voffset (streamout should set local_invocation_index * 8) * i32 SGPR $3 = orderedID - * i64 VGPR $4 = {orderedID, numDwords} (set in 4 lanes) + * i64 VGPR $4 = 64-bit VGPR atomic src (streamout should set {orderedID, numDwords}) */ /* Issue (num_atomics - 1) atomics to initialize the results. @@ -3639,13 +3639,12 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins unsigned issue_index = (num_atomics - 1 + i) % num_atomics; unsigned read_index = i; - /* result = dwords_written */ ptr += sprintf(ptr, /* Issue (or repeat) the attempt. 
*/ "global_atomic_ordered_add_b64 v[%u:%u], $2, $4, $1 th:TH_ATOMIC_RETURN\n" "s_wait_loadcnt 0x%x\n" /* if (result[check_index].ordered_id == ordered_id) { - * dwords_written = result[check_index].dwords_written; + * return_value = result[check_index].value; * break; * } */ diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index e31553f7ed0..db2d4a404ed 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -721,7 +721,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_load_topology_id_intel: case nir_intrinsic_load_scratch_base_ptr: case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd: - case nir_intrinsic_ordered_xfb_counter_add_gfx12_amd: + case nir_intrinsic_ordered_add_loop_gfx12_amd: case nir_intrinsic_xfb_counter_sub_gfx11_amd: case nir_intrinsic_unit_test_divergent_amd: case nir_intrinsic_load_stack: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 24197079d1d..745bc1c3b44 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1711,23 +1711,26 @@ system_value("ordered_id_amd", 1) # WRITE_MASK = mask for counter channel to update intrinsic("ordered_xfb_counter_add_gfx11_amd", dest_comp=0, src_comp=[1, 0], indices=[WRITE_MASK], bit_sizes=[32]) -# Add dwords_written[] to global streamout offsets. +# Execute the atomic ordered add loop. This does what ds_ordered_count did in previous generations. +# This is implemented with inline assembly to get the most optimal code. 
+# +# Inputs: +# exec = one lane per counter (use nir_push_if, streamout should always enable 4 lanes) +# src[0] = 64-bit SGPR atomic base address (streamout should use nir_load_xfb_state_address_gfx12_amd) +# src[1] = 32-bit VGPR voffset (streamout should set local_invocation_index * 8) +# src[2] = 32-bit SGPR ordered_id (use nir_load_ordered_id_amd for streamout, compute shaders +# should generate it manually) +# src[3] = 64-bit VGPR atomic src, use pack_64_2x32_split(ordered_id, value), streamout should do: +# pack_64_2x32_split(ordered_id, "dwords written per workgroup" for each buffer) +# +# dst = 32-bit VGPR of the previous 32-bit value in memory, returned for all enabled lanes + +# Example - streamout: It's used to add dwords_written[] to global streamout offsets. # * Exactly 4 lanes must be active, one for each buffer binding. # * Disabled buffers must set dwords_written=0 for their lane, but the lane # must be enabled. -# * This is implemented with inline assembly, which is why some parameters -# appear trivial or redundant. # -# Inputs: -# exec = 0xf (set by the caller using nir_push_if) -# src[0] = 64-bit SGPR atomic base address (use nir_load_xfb_state_address_gfx12_amd) -# src[1] = 32-bit VGPR voffset, set in 4 lanes (always local_invocation_index * 8) -# src[2] = 32-bit SGPR ordered_id (use nir_load_ordered_id_amd) -# src[3] = 64-bit VGPR atomic src, set in 4 lanes -# (always pack_64_2x32_split(ordered_id, "dwords written per workgroup" for each buffer)) -# -# dst = 32-bit VGPR of the previous value of dwords_writtenN in memory, returned in 4 lanes -intrinsic("ordered_xfb_counter_add_gfx12_amd", dest_comp=1, src_comp=[1, 1, 1, 1], bit_sizes=[32]) +intrinsic("ordered_add_loop_gfx12_amd", dest_comp=1, src_comp=[1, 1, 1, 1], bit_sizes=[32]) # Subtract from global streamout buffer offsets. Used to fix up the offsets # when we overflow streamout buffers.