From 4263b497783b8ffde3c58d95fd86623cd6dc88a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 2 Jun 2025 22:27:41 -0400 Subject: [PATCH] ac/nir: remove ngg_scratch LDS ABI, allocate it in the lowering pass This is a cleanup. Old gs LDS layout: [es outputs][gs outputs][scratch] Old nogs LDS layout: [xfb/cull][scratch] New gs LDS layout: [es outputs][scratch|gs outputs] New nogs LDS layout: [scratch|xfb/cull] The LDS scratch is moved to the beginning of the preceding buffer in LDS, while the addresses in that LDS buffer are offset by the scratch size. It effectively merges the LDS scratch with the preceding buffer in LDS. Thanks to that, we no longer need the ngg_scratch ABI and the offset in a user SGPR. The lowering passes now return the LDS scratch size, which is used by the drivers to determine the final LDS size. The ngg_lds_layout SGPR is now unused without GS in RADV. Part-of: --- src/amd/common/nir/ac_nir.h | 4 +- src/amd/common/nir/ac_nir_lower_ngg.c | 39 ++++++++++--------- src/amd/common/nir/ac_nir_lower_ngg_gs.c | 20 ++++++---- src/amd/compiler/aco_shader_info.h | 1 - .../instruction_selection/aco_isel_setup.cpp | 1 - .../aco_select_nir_intrinsics.cpp | 6 --- src/amd/llvm/ac_nir_to_llvm.c | 1 - src/amd/vulkan/nir/radv_nir_lower_abi.c | 3 -- src/amd/vulkan/radv_cmd_buffer.c | 6 +-- src/amd/vulkan/radv_nir_to_llvm.c | 9 +---- src/amd/vulkan/radv_shader.c | 11 ++---- src/amd/vulkan/radv_shader.h | 2 - src/amd/vulkan/radv_shader_info.c | 19 +++------ src/amd/vulkan/radv_shader_info.h | 2 +- src/compiler/nir/nir_divergence_analysis.c | 1 - src/compiler/nir/nir_intrinsics.py | 2 - .../drivers/radeonsi/gfx10_shader_ngg.c | 22 ++++------- src/gallium/drivers/radeonsi/si_shader.c | 15 ++----- src/gallium/drivers/radeonsi/si_shader.h | 2 +- src/gallium/drivers/radeonsi/si_shader_aco.c | 7 ---- src/gallium/drivers/radeonsi/si_shader_info.h | 1 + .../drivers/radeonsi/si_shader_internal.h | 1 - src/gallium/drivers/radeonsi/si_shader_llvm.c | 26 ------------- src/gallium/drivers/radeonsi/si_shader_llvm.h | 1 - 24 files changed, 60 insertions(+), 142 deletions(-) diff --git a/src/amd/common/nir/ac_nir.h b/src/amd/common/nir/ac_nir.h index 47a30ab9d92..537eb2491eb 100644 --- a/src/amd/common/nir/ac_nir.h +++ b/src/amd/common/nir/ac_nir.h @@ -217,11 +217,11 @@ typedef struct { bool ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options, - uint32_t *out_lds_vertex_size); + uint32_t *out_lds_vertex_size, uint8_t *out_lds_scratch_size); bool ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, - uint32_t *out_lds_vertex_size); + uint32_t *out_lds_vertex_size, uint8_t *out_lds_scratch_size); bool ac_nir_lower_ngg_mesh(nir_shader *shader, diff --git a/src/amd/common/nir/ac_nir_lower_ngg.c b/src/amd/common/nir/ac_nir_lower_ngg.c index 6ebbeaeb942..a044bd7af29 100644 --- a/src/amd/common/nir/ac_nir_lower_ngg.c +++ b/src/amd/common/nir/ac_nir_lower_ngg.c @@ -74,6 +74,7 @@ typedef struct /* LDS params */ unsigned pervertex_lds_bytes; + unsigned lds_scratch_size; nir_variable *repacked_rel_patch_id; @@ -108,9 +109,9 @@ enum { }; static nir_def * -pervertex_lds_addr(nir_builder *b, nir_def *vertex_idx, unsigned per_vtx_bytes) +pervertex_lds_addr(nir_builder *b, lower_ngg_nogs_state *s, nir_def *vertex_idx, unsigned per_vtx_bytes) { - return nir_imul_imm(b, vertex_idx, per_vtx_bytes); + return nir_iadd_imm_nuw(b, nir_imul_imm(b, vertex_idx, per_vtx_bytes), s->lds_scratch_size); } static void @@ -206,7 +207,7 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *s, nir_def *arg) for (int i = 0; i < s->options->num_vertices_per_primitive; i++) { nir_def *vtx_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]); - nir_def *addr = pervertex_lds_addr(b, vtx_idx, s->pervertex_lds_bytes); + nir_def *addr = pervertex_lds_addr(b, s, vtx_idx, s->pervertex_lds_bytes); /* Edge flags share LDS with XFB. */ nir_def *edge = ac_nir_load_shared_xfb(b, 32, addr, &s->out, VARYING_SLOT_EDGE, 0); @@ -261,7 +262,7 @@ emit_ngg_nogs_prim_id_store_shared(nir_builder *b, lower_ngg_nogs_state *s) b, gs_vtx_indices, s->options->num_vertices_per_primitive, provoking_vertex); nir_def *prim_id = nir_load_primitive_id(b); - nir_def *addr = pervertex_lds_addr(b, provoking_vtx_idx, s->pervertex_lds_bytes); + nir_def *addr = pervertex_lds_addr(b, s, provoking_vtx_idx, s->pervertex_lds_bytes); /* primitive id is always at last of a vertex */ nir_store_shared(b, prim_id, addr, .base = s->pervertex_lds_bytes - 4); @@ -301,7 +302,7 @@ emit_store_ngg_nogs_es_primitive_id(nir_builder *b, lower_ngg_nogs_state *s) /* LDS address where the primitive ID is stored */ nir_def *thread_id_in_threadgroup = nir_load_local_invocation_index(b); nir_def *addr = - pervertex_lds_addr(b, thread_id_in_threadgroup, s->pervertex_lds_bytes); + pervertex_lds_addr(b, s, thread_id_in_threadgroup, s->pervertex_lds_bytes); /* Load primitive ID from LDS */ prim_id = nir_load_shared(b, 1, 32, addr, .base = s->pervertex_lds_bytes - 4); @@ -478,7 +479,7 @@ compact_vertices_after_culling(nir_builder *b, { nir_if *if_es_accepted = nir_push_if(b, nir_load_var(b, s->es_accepted_var)); { - nir_def *exporter_addr = pervertex_lds_addr(b, es_exporter_tid, pervertex_lds_bytes); + nir_def *exporter_addr = pervertex_lds_addr(b, s, es_exporter_tid, pervertex_lds_bytes); /* Store the exporter thread's index to the LDS space of the current thread so GS threads can load it */ nir_store_shared(b, nir_u2u8(b, es_exporter_tid), es_vertex_lds_addr, .base = lds_es_exporter_tid); @@ -568,7 +569,7 @@ compact_vertices_after_culling(nir_builder *b, if_gs_accepted = nir_push_if(b, gs_accepted); { - nir_def *exporter_addr = pervertex_lds_addr(b, gs_exporter_tid, pervertex_lds_bytes); + nir_def *exporter_addr = pervertex_lds_addr(b, s, gs_exporter_tid, pervertex_lds_bytes); nir_def *prim_exp_arg = nir_load_var(b, s->prim_exp_arg_var); /* Store the primitive export argument into the address of the exporter thread. */ @@ -1100,8 +1101,6 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c remove_culling_shader_outputs(b->shader, s); b->cursor = nir_after_impl(impl); - nir_def *lds_scratch_base = nir_load_lds_ngg_scratch_base_amd(b); - /* Run culling algorithms if culling is enabled. * * NGG culling can be enabled or disabled in runtime. @@ -1112,7 +1111,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c nir_if *if_cull_en = nir_push_if(b, nir_load_cull_any_enabled_amd(b)); { nir_def *invocation_index = nir_load_local_invocation_index(b); - nir_def *es_vertex_lds_addr = pervertex_lds_addr(b, invocation_index, pervertex_lds_bytes); + nir_def *es_vertex_lds_addr = pervertex_lds_addr(b, s, invocation_index, pervertex_lds_bytes); /* ES invocations store their vertex data to LDS for GS threads to read. */ if_es_thread = nir_push_if(b, es_thread); @@ -1153,7 +1152,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c /* Load W positions of vertices first because the culling code will use these first */ for (unsigned vtx = 0; vtx < s->options->num_vertices_per_primitive; ++vtx) { - s->vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], pervertex_lds_bytes); + s->vtx_addr[vtx] = pervertex_lds_addr(b, s, vtx_idx[vtx], pervertex_lds_bytes); pos[vtx][3] = nir_load_shared(b, 1, 32, s->vtx_addr[vtx], .base = lds_es_pos_w); nir_store_var(b, gs_vtxaddr_vars[vtx], s->vtx_addr[vtx], 0x1u); } @@ -1209,7 +1208,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c nir_def *accepted[] = { es_accepted, gs_accepted }; ac_nir_wg_repack_result rep[2] = {0}; const unsigned num_rep = s->options->compact_primitives ? 2 : 1; - ac_nir_repack_invocations_in_workgroup(b, accepted, rep, num_rep, lds_scratch_base, + ac_nir_repack_invocations_in_workgroup(b, accepted, rep, num_rep, nir_imm_int(b, 0), s->max_num_waves, s->options->wave_size); nir_def *num_live_vertices_in_workgroup = rep[0].num_repacked_invocations; nir_def *es_exporter_tid = rep[0].repacked_invocation_index; @@ -1299,7 +1298,7 @@ ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s) edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1)); nir_def *tid = nir_load_local_invocation_index(b); - nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes); + nir_def *addr = pervertex_lds_addr(b, s, tid, s->pervertex_lds_bytes); /* Edge flags share LDS with XFB. */ ac_nir_store_shared_xfb(b, edgeflag, addr, &s->out, VARYING_SLOT_EDGE, 0); @@ -1335,7 +1334,7 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s) } nir_def *tid = nir_load_local_invocation_index(b); - nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes); + nir_def *addr = pervertex_lds_addr(b, s, tid, s->pervertex_lds_bytes); u_foreach_bit64(slot, xfb_outputs) { u_foreach_bit(c, xfb_mask[slot]) { @@ -1377,8 +1376,6 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s) { nir_xfb_info *info = ac_nir_get_sorted_xfb_info(b->shader); - nir_def *lds_scratch_base = nir_load_lds_ngg_scratch_base_amd(b); - /* Get global buffer offset where this workgroup will stream out data to. */ nir_def *generated_prim = nir_load_workgroup_num_input_primitives_amd(b); nir_def *gen_prim_per_stream[4] = {generated_prim, 0, 0, 0}; @@ -1387,7 +1384,7 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s) nir_def *so_buffer[4] = {0}; nir_def *tid_in_tg = nir_load_local_invocation_index(b); ac_nir_ngg_build_streamout_buffer_info(b, info, s->options->hw_info->gfx_level, s->options->has_xfb_prim_query, - s->options->use_gfx12_xfb_intrinsic, lds_scratch_base, tid_in_tg, + s->options->use_gfx12_xfb_intrinsic, nir_imm_int(b, 0), tid_in_tg, gen_prim_per_stream, so_buffer, buffer_offsets, emit_prim_per_stream); @@ -1409,7 +1406,7 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s) nir_push_if(b, nir_igt_imm(b, num_vert_per_prim, i)); { nir_def *vtx_lds_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]); - nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, s->pervertex_lds_bytes); + nir_def *vtx_lds_addr = pervertex_lds_addr(b, s, vtx_lds_idx, s->pervertex_lds_bytes); ac_nir_ngg_build_streamout_vertex(b, info, 0, so_buffer, buffer_offsets, i, vtx_lds_addr, &s->out); } @@ -1507,7 +1504,7 @@ ac_ngg_nogs_get_pervertex_lds_size(lower_ngg_nogs_state *s, bool ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options, - uint32_t *out_lds_vertex_size) + uint32_t *out_lds_vertex_size, uint8_t *out_lds_scratch_size) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); assert(impl); @@ -1553,6 +1550,9 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option .gs_exported_var = gs_exported_var, .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size), .has_user_edgeflags = has_user_edgeflags, + .lds_scratch_size = ac_ngg_get_scratch_lds_size(shader->info.stage, options->max_workgroup_size, + options->wave_size, streamout_enabled, + options->can_cull, options->compact_primitives), }; /* Can't export the primitive ID both as per-vertex and per-primitive. */ @@ -1823,6 +1823,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option options->export_primitive_id, state.has_user_edgeflags, options->can_cull, state.deferred.uses_instance_id, state.deferred.uses_tess_primitive_id); + *out_lds_scratch_size = state.lds_scratch_size; return true; } diff --git a/src/amd/common/nir/ac_nir_lower_ngg_gs.c b/src/amd/common/nir/ac_nir_lower_ngg_gs.c index 1e7db649cd8..b5889e00a2d 100644 --- a/src/amd/common/nir/ac_nir_lower_ngg_gs.c +++ b/src/amd/common/nir/ac_nir_lower_ngg_gs.c @@ -23,9 +23,9 @@ typedef struct unsigned max_num_waves; unsigned num_vertices_per_primitive; nir_def *lds_addr_gs_out_vtx; - nir_def *lds_addr_gs_scratch; unsigned lds_bytes_per_gs_out_vertex; unsigned lds_offs_primflags; + unsigned lds_scratch_size; bool output_compile_time_known; bool streamout_enabled; /* Outputs */ @@ -79,7 +79,7 @@ ngg_gs_out_vertex_addr(nir_builder *b, nir_def *out_vtx_idx, lower_ngg_gs_state } nir_def *out_vtx_offs = nir_imul_imm(b, out_vtx_idx, s->lds_bytes_per_gs_out_vertex); - return nir_iadd_nuw(b, out_vtx_offs, s->lds_addr_gs_out_vtx); + return nir_iadd_nuw(b, out_vtx_offs, nir_iadd_imm_nuw(b, s->lds_addr_gs_out_vtx, s->lds_scratch_size)); } static nir_def * @@ -739,7 +739,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s) unsigned scratch_stride = ALIGN(s->max_num_waves, 4); nir_def *scratch_base = - nir_iadd_imm(b, s->lds_addr_gs_scratch, stream * scratch_stride); + nir_iadd_imm(b, s->lds_addr_gs_out_vtx, stream * scratch_stride); /* We want to export primitives to streamout buffer in sequence, * but not all vertices are alive or mark end of a primitive, so @@ -776,7 +776,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s) nir_def *buffer_offsets[4] = {0}; nir_def *so_buffer[4] = {0}; ac_nir_ngg_build_streamout_buffer_info(b, info, s->options->hw_info->gfx_level, s->options->has_xfb_prim_query, - s->options->use_gfx12_xfb_intrinsic, s->lds_addr_gs_scratch, tid_in_tg, + s->options->use_gfx12_xfb_intrinsic, s->lds_addr_gs_out_vtx, tid_in_tg, gen_prim, so_buffer, buffer_offsets, emit_prim); for (unsigned stream = 0; stream < 4; stream++) { @@ -871,7 +871,7 @@ ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s) nir_def *vertex_live = nir_ine_imm(b, out_vtx_primflag_0, 0); ac_nir_wg_repack_result rep = {0}; - ac_nir_repack_invocations_in_workgroup(b, &vertex_live, &rep, 1, s->lds_addr_gs_scratch, + ac_nir_repack_invocations_in_workgroup(b, &vertex_live, &rep, 1, s->lds_addr_gs_out_vtx, s->max_num_waves, s->options->wave_size); nir_def *workgroup_num_vertices = rep.num_repacked_invocations; @@ -900,16 +900,20 @@ ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s) bool ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, - uint32_t *out_lds_vertex_size) + uint32_t *out_lds_vertex_size, uint8_t *out_lds_scratch_size) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); assert(impl); + bool streamout_enabled = shader->xfb_info && !options->disable_streamout; lower_ngg_gs_state state = { .options = options, .impl = impl, .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size), - .streamout_enabled = shader->xfb_info && !options->disable_streamout, + .streamout_enabled = streamout_enabled, + .lds_scratch_size = ac_ngg_get_scratch_lds_size(shader->info.stage, options->max_workgroup_size, + options->wave_size, streamout_enabled, + options->can_cull, options->compact_primitives), }; if (!options->can_cull) { @@ -940,7 +944,6 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared); state.lds_addr_gs_out_vtx = nir_load_lds_ngg_gs_out_vertex_base_amd(b); - state.lds_addr_gs_scratch = nir_load_lds_ngg_scratch_base_amd(b); /* Wrap the GS control flow. */ nir_if *if_gs_thread = nir_push_if(b, nir_is_subgroup_invocation_lt_amd(b, nir_load_merged_wave_info_amd(b), .base = 8)); @@ -998,5 +1001,6 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, nir_remove_dead_variables(shader, nir_var_function_temp, NULL); *out_lds_vertex_size = state.lds_bytes_per_gs_out_vertex; + *out_lds_scratch_size = state.lds_scratch_size; return nir_progress(true, impl, nir_metadata_none); } diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h index eb97834738e..0f93944a56f 100644 --- a/src/amd/compiler/aco_shader_info.h +++ b/src/amd/compiler/aco_shader_info.h @@ -188,7 +188,6 @@ enum aco_symbol_id { aco_symbol_invalid, aco_symbol_scratch_addr_lo, aco_symbol_scratch_addr_hi, - aco_symbol_lds_ngg_scratch_base, aco_symbol_lds_ngg_gs_out_vertex_base, aco_symbol_const_data_addr, }; diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp index 2f3ef3270e6..93431691b96 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp @@ -540,7 +540,6 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_ballot_relaxed: case nir_intrinsic_bindless_image_samples: case nir_intrinsic_load_scalar_arg_amd: - case nir_intrinsic_load_lds_ngg_scratch_base_amd: case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: case nir_intrinsic_load_smem_amd: case nir_intrinsic_unit_test_uniform_amd: type = RegType::sgpr; break; diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 8b40ac210b0..660b16c34e2 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -4932,12 +4932,6 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) ctx->block->instructions.emplace_back(std::move(vec)); break; } - case nir_intrinsic_load_lds_ngg_scratch_base_amd: { - Temp dst = get_ssa_temp(ctx, &instr->def); - bld.sop1(aco_opcode::p_load_symbol, Definition(dst), - Operand::c32(aco_symbol_lds_ngg_scratch_base)); - break; - } case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: { Temp dst = get_ssa_temp(ctx, &instr->def); bld.sop1(aco_opcode::p_load_symbol, Definition(dst), diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 0233b672872..22f5940b879 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -2768,7 +2768,6 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins result = ac_build_gather_values(&ctx->ac, values, 3); break; } - case nir_intrinsic_load_lds_ngg_scratch_base_amd: case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: result = ctx->abi->intrinsic_load(ctx->abi, instr); break; diff --git a/src/amd/vulkan/nir/radv_nir_lower_abi.c b/src/amd/vulkan/nir/radv_nir_lower_abi.c index aba35129b4d..23cf8e75752 100644 --- a/src/amd/vulkan/nir/radv_nir_lower_abi.c +++ b/src/amd/vulkan/nir/radv_nir_lower_abi.c @@ -370,9 +370,6 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: replacement = GET_SGPR_FIELD_NIR(s->args->ngg_lds_layout, NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE); break; - case nir_intrinsic_load_lds_ngg_scratch_base_amd: - replacement = GET_SGPR_FIELD_NIR(s->args->ngg_lds_layout, NGG_LDS_LAYOUT_SCRATCH_BASE); - break; case nir_intrinsic_load_num_vertices_per_primitive_amd: { unsigned num_vertices; diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index a9569549842..3f816125467 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -2269,12 +2269,10 @@ radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *e const uint32_t ngg_lds_layout_offset = radv_get_user_sgpr_loc(shader, AC_UD_NGG_LDS_LAYOUT); assert(ngg_lds_layout_offset); - assert(!(shader->info.ngg_info.esgs_ring_size & 0xffff0000) && - !(shader->info.ngg_info.scratch_lds_base & 0xffff0000)); + assert(!(shader->info.ngg_info.esgs_ring_size & 0xffff0000)); radeon_set_sh_reg(ngg_lds_layout_offset, - SET_SGPR_FIELD(NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE, shader->info.ngg_info.esgs_ring_size) | - SET_SGPR_FIELD(NGG_LDS_LAYOUT_SCRATCH_BASE, shader->info.ngg_info.scratch_lds_base)); + SET_SGPR_FIELD(NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE, shader->info.ngg_info.esgs_ring_size)); radeon_end(); } diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 059d8795e7b..f11594cd8e0 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -275,15 +275,8 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, const struct radv_nir declare_esgs_ring(&ctx); if (ctx.stage == MESA_SHADER_GEOMETRY) { - /* Scratch space used by NGG GS for repacking vertices at the end. */ - LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8); - LLVMValueRef gs_ngg_scratch = - LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(gs_ngg_scratch, LLVMGetUndef(ai32)); - LLVMSetLinkage(gs_ngg_scratch, LLVMExternalLinkage); - LLVMSetAlignment(gs_ngg_scratch, 4); - /* Vertex emit space used by NGG GS for storing all vertex attributes. */ + LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8); LLVMValueRef gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx.ac.module, LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); LLVMSetInitializer(gs_ngg_emit, LLVMGetUndef(ai32)); diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 071af4c9c44..b8ec1d59009 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -815,11 +815,13 @@ radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage, options.export_primitive_id_per_prim = info->outinfo.export_prim_id_per_primitive; options.instance_rate_inputs = gfx_state->vi.instance_rate_inputs << VERT_ATTRIB_GENERIC0; - NIR_PASS(_, nir, ac_nir_lower_ngg_nogs, &options, &ngg_stage->info.ngg_lds_vertex_size); + NIR_PASS(_, nir, ac_nir_lower_ngg_nogs, &options, &ngg_stage->info.ngg_lds_vertex_size, + &ngg_stage->info.ngg_lds_scratch_size); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { assert(info->is_ngg); - NIR_PASS(_, nir, ac_nir_lower_ngg_gs, &options, &ngg_stage->info.ngg_lds_vertex_size); + NIR_PASS(_, nir, ac_nir_lower_ngg_gs, &options, &ngg_stage->info.ngg_lds_vertex_size, + &ngg_stage->info.ngg_lds_scratch_size); } else if (nir->info.stage == MESA_SHADER_MESH) { /* ACO aligns the workgroup size to the wave size. */ unsigned hw_workgroup_size = ALIGN(info->workgroup_size, info->wave_size); @@ -1407,11 +1409,6 @@ radv_open_rtld_binary(struct radv_device *device, const struct radv_shader_binar sym->name = "ngg_emit"; sym->size = binary->info.ngg_info.ngg_emit_size * 4; sym->align = 4; - - sym = &lds_symbols[num_lds_symbols++]; - sym->name = "ngg_scratch"; - sym->size = 8; - sym->align = 4; } struct ac_rtld_open_info open_info = { diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index d74bb9d60b7..2c3ee50246f 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -226,8 +226,6 @@ struct radv_nir_compiler_options { #define NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE__SHIFT 0 #define NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE__MASK 0xffff -#define NGG_LDS_LAYOUT_SCRATCH_BASE__SHIFT 16 -#define NGG_LDS_LAYOUT_SCRATCH_BASE__MASK 0xffff #define NGG_STATE_NUM_VERTS_PER_PRIM__SHIFT 0 #define NGG_STATE_NUM_VERTS_PER_PRIM__MASK 0x7 diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index f6a150a3e4b..8e69c66844d 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -1421,24 +1421,20 @@ radv_get_pre_rast_input_topology(const struct radv_shader_info *es_info, const s } static unsigned -gfx10_get_ngg_scratch_lds_base(const struct radv_device *device, const struct radv_shader_info *es_info, - const struct radv_shader_info *gs_info, const struct gfx10_ngg_info *ngg_info) +gfx10_get_ngg_vert_prim_lds_size(const struct radv_device *device, const struct radv_shader_info *es_info, + const struct radv_shader_info *gs_info, const struct gfx10_ngg_info *ngg_info) { - uint32_t scratch_lds_base; - if (gs_info) { const unsigned esgs_ring_lds_bytes = ngg_info->esgs_ring_size; const unsigned gs_total_out_vtx_bytes = ngg_info->ngg_emit_size * 4u; - scratch_lds_base = ALIGN(esgs_ring_lds_bytes + gs_total_out_vtx_bytes, 8u /* for the repacking code */); + return esgs_ring_lds_bytes + gs_total_out_vtx_bytes; } else { assert(ngg_info->hw_max_esverts <= 256); unsigned total_es_lds_bytes = es_info->ngg_lds_vertex_size * ngg_info->hw_max_esverts; - scratch_lds_base = ALIGN(total_es_lds_bytes, 8u); + return total_es_lds_bytes; } - - return scratch_lds_base; } void @@ -1648,13 +1644,8 @@ gfx10_get_ngg_info(const struct radv_device *device, struct radv_shader_info *es assert(out->hw_max_esverts >= min_esverts); /* HW limitation */ - out->scratch_lds_base = gfx10_get_ngg_scratch_lds_base(device, es_info, gs_info, out); - - /* Get scratch LDS usage. */ const struct radv_shader_info *info = gs_info ? gs_info : es_info; - const unsigned scratch_lds_size = ac_ngg_get_scratch_lds_size(info->stage, info->workgroup_size, info->wave_size, - pdev->use_ngg_streamout, info->has_ngg_culling, false); - out->lds_size = out->scratch_lds_base + scratch_lds_size; + out->lds_size = info->ngg_lds_scratch_size + gfx10_get_ngg_vert_prim_lds_size(device, es_info, gs_info, out); unsigned workgroup_size = ac_compute_ngg_workgroup_size(max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor); diff --git a/src/amd/vulkan/radv_shader_info.h b/src/amd/vulkan/radv_shader_info.h index f72ea8af4af..2a4996f14f8 100644 --- a/src/amd/vulkan/radv_shader_info.h +++ b/src/amd/vulkan/radv_shader_info.h @@ -75,7 +75,6 @@ struct gfx10_ngg_info { uint32_t prim_amp_factor; uint32_t vgt_esgs_ring_itemsize; uint32_t esgs_ring_size; - uint32_t scratch_lds_base; uint32_t lds_size; bool max_vert_out_per_gs_instance; }; @@ -98,6 +97,7 @@ struct radv_shader_info { bool has_ngg_early_prim_export; bool has_prim_query; bool has_xfb_query; + uint8_t ngg_lds_scratch_size; uint32_t num_tess_patches; uint32_t esgs_itemsize; /* Only for VS or TES as ES */ uint32_t ngg_lds_vertex_size; /* VS,TES: Cull+XFB, GS: GSVS size */ diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 593f38e9a8f..dc5800e5daa 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -321,7 +321,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_load_ordered_id_amd: case nir_intrinsic_load_gs_wave_id_amd: case nir_intrinsic_load_provoking_vtx_in_prim_amd: - case nir_intrinsic_load_lds_ngg_scratch_base_amd: case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: case nir_intrinsic_load_btd_shader_type_intel: case nir_intrinsic_load_base_global_invocation_id: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 1f9cb1c6915..700dc166896 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -2000,8 +2000,6 @@ intrinsic("atomic_add_xfb_prim_count_amd", [1], indices=[STREAM_ID]) # src[] = { invocation_count }. intrinsic("atomic_add_shader_invocation_count_amd", [1]) -# LDS offset for scratch section in NGG shader -system_value("lds_ngg_scratch_base_amd", 1) # LDS offset for NGG GS shader vertex emit system_value("lds_ngg_gs_out_vertex_base_amd", 1) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 308ebb5f697..22998c2594d 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -34,18 +34,6 @@ static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); } -unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader) -{ - const struct si_shader_selector *sel = shader->selector; - - return ac_ngg_get_scratch_lds_size(sel->stage, - si_get_max_workgroup_size(shader), - shader->wave_size, - shader->info.num_streamout_vec4s != 0, - si_shader_culling_enabled(shader), - false) / 4; -} - /** * Determine subgroup information like maximum number of vertices and prims. * @@ -65,9 +53,8 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1; /* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */ - const unsigned scratch_lds_size = gfx10_ngg_get_scratch_dw_size(shader); - /* Scratch is at last of LDS space and 2 dwords aligned, so it may cost more for alignment. */ - const unsigned max_lds_size = 16 * 1024 - ALIGN(scratch_lds_size, 2); + /* The LDS scratch is at the beginning of LDS space. */ + const unsigned max_lds_size = 16 * 1024 - shader->info.ngg_lds_scratch_size / 4; const unsigned target_lds_size = max_lds_size; unsigned esvert_lds_size = 0; unsigned gsprim_lds_size = 0; @@ -201,6 +188,11 @@ retry_select_mode: esvert_lds_size; shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size; + if (gs_stage == MESA_SHADER_GEOMETRY) + shader->ngg.ngg_emit_size += shader->info.ngg_lds_scratch_size / 4; + else + shader->gs_info.esgs_ring_size += shader->info.ngg_lds_scratch_size / 4; + assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */ /* If asserts are disabled, we use the same conditions to return false */ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 972eceaba6f..108520f492e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -459,15 +459,6 @@ static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shade if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) size_in_dw += shader->ngg.ngg_emit_size; - if (shader->key.ge.as_ngg) { - unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader); - if (scratch_dw_size) { - /* scratch base address needs to be 8 byte aligned */ - size_in_dw = ALIGN(size_in_dw, 2); - size_in_dw += scratch_dw_size; - } - } - shader->config.lds_size = DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage)); } @@ -1172,7 +1163,8 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir, options.instance_rate_inputs = instance_rate_inputs; options.cull_clipdist_mask = clip_plane_enable; - NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options, &shader->info.ngg_lds_vertex_size); + NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options, &shader->info.ngg_lds_vertex_size, + &shader->info.ngg_lds_scratch_size); } else { assert(nir->info.stage == MESA_SHADER_GEOMETRY); @@ -1185,7 +1177,8 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir, if (key->ge.part.gs.es) nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory; - NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options, &shader->info.ngg_lds_vertex_size); + NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options, &shader->info.ngg_lds_vertex_size, + &shader->info.ngg_lds_scratch_size); } /* may generate some vector output store */ diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 76b2e18d788..be3baa3d1bd 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -757,7 +757,7 @@ struct gfx9_gs_info { unsigned gs_prims_per_subgroup; unsigned gs_inst_prims_in_subgroup; unsigned max_prims_per_subgroup; - unsigned esgs_ring_size; /* in bytes */ + unsigned esgs_ring_size; /* in dwords */ }; struct si_shader { diff --git a/src/gallium/drivers/radeonsi/si_shader_aco.c b/src/gallium/drivers/radeonsi/si_shader_aco.c index 7f13b322a8d..0ce57cf3e7a 100644 --- a/src/gallium/drivers/radeonsi/si_shader_aco.c +++ b/src/gallium/drivers/radeonsi/si_shader_aco.c @@ -207,13 +207,6 @@ si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code_for_write, else value |= S_008F04_SWIZZLE_ENABLE_GFX6(1); break; - case aco_symbol_lds_ngg_scratch_base: - assert(sel->stage <= MESA_SHADER_GEOMETRY && key->ge.as_ngg); - value = shader->gs_info.esgs_ring_size * 4; - if (sel->stage == MESA_SHADER_GEOMETRY) - value += shader->ngg.ngg_emit_size * 4; - value = ALIGN(value, 8); - break; case aco_symbol_lds_ngg_gs_out_vertex_base: assert(sel->stage == MESA_SHADER_GEOMETRY && key->ge.as_ngg); value = shader->gs_info.esgs_ring_size * 4; diff --git a/src/gallium/drivers/radeonsi/si_shader_info.h b/src/gallium/drivers/radeonsi/si_shader_info.h index 47d84f5e378..1563f22b696 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.h +++ b/src/gallium/drivers/radeonsi/si_shader_info.h @@ -237,6 +237,7 @@ struct si_shader_variant_info { uint8_t nr_pos_exports; uint8_t nr_param_exports; uint8_t num_streamout_vec4s; + uint8_t ngg_lds_scratch_size; unsigned private_mem_vgprs; unsigned max_simd_waves; uint32_t ngg_lds_vertex_size; /* VS,TES: Cull+XFB, GS: GSVS size */ diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 876d2d39cab..7da9019999f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -108,7 +108,6 @@ void si_get_ps_epilog_args(struct si_shader_args *args, /* gfx10_shader_ngg.c */ bool gfx10_ngg_export_prim_early(struct si_shader *shader); -unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader); bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader); struct nir_def; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 64dbc1b1d51..500eb368f5e 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -447,9 +447,6 @@ static LLVMValueRef si_llvm_load_intrinsic(struct ac_shader_abi *abi, nir_intrin struct si_shader_context *ctx = si_shader_context_from_abi(abi); switch (intrin->intrinsic) { - case nir_intrinsic_load_lds_ngg_scratch_base_amd: - return LLVMBuildPtrToInt(ctx->ac.builder, ctx->gs_ngg_scratch.value, ctx->ac.i32, ""); - case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: return LLVMBuildPtrToInt(ctx->ac.builder, ctx->gs_ngg_emit, ctx->ac.i32, ""); @@ -528,14 +525,6 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade case MESA_SHADER_GEOMETRY: if (ctx->shader->key.ge.as_ngg) { - LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader)); - ctx->gs_ngg_scratch = (struct ac_llvm_pointer) { - .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS), - .pointee_type = ai32 - }; - LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(ai32)); - LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8); - ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace( ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); @@ -577,21 +566,6 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade if (is_merged_esgs_stage || is_nogs_ngg_stage) si_llvm_declare_lds_esgs_ring(ctx); - /* This is really only needed when streamout and / or vertex - * compaction is enabled. - */ - if (is_nogs_ngg_stage && - (shader->info.num_streamout_vec4s || si_shader_culling_enabled(shader))) { - LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader)); - ctx->gs_ngg_scratch = (struct ac_llvm_pointer) { - .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", - AC_ADDR_SPACE_LDS), - .pointee_type = asi32 - }; - LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(asi32)); - LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8); - } - /* For merged shaders (VS-TCS, VS-GS, TES-GS): */ if (ctx->screen->info.gfx_level >= GFX9 && si_is_merged_shader(shader)) { /* Set EXEC = ~0 before the first shader. For monolithic shaders, the wrapper diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.h b/src/gallium/drivers/radeonsi/si_shader_llvm.h index 9daf3ae728f..46662564f77 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.h +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.h @@ -31,7 +31,6 @@ struct si_shader_context { struct ac_llvm_compiler *compiler; LLVMValueRef gs_ngg_emit; - struct ac_llvm_pointer gs_ngg_scratch; LLVMValueRef return_value; };