diff --git a/src/amd/common/nir/ac_nir.h b/src/amd/common/nir/ac_nir.h index 47a30ab9d92..537eb2491eb 100644 --- a/src/amd/common/nir/ac_nir.h +++ b/src/amd/common/nir/ac_nir.h @@ -217,11 +217,11 @@ typedef struct { bool ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options, - uint32_t *out_lds_vertex_size); + uint32_t *out_lds_vertex_size, uint8_t *out_lds_scratch_size); bool ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, - uint32_t *out_lds_vertex_size); + uint32_t *out_lds_vertex_size, uint8_t *out_lds_scratch_size); bool ac_nir_lower_ngg_mesh(nir_shader *shader, diff --git a/src/amd/common/nir/ac_nir_lower_ngg.c b/src/amd/common/nir/ac_nir_lower_ngg.c index 6ebbeaeb942..a044bd7af29 100644 --- a/src/amd/common/nir/ac_nir_lower_ngg.c +++ b/src/amd/common/nir/ac_nir_lower_ngg.c @@ -74,6 +74,7 @@ typedef struct /* LDS params */ unsigned pervertex_lds_bytes; + unsigned lds_scratch_size; nir_variable *repacked_rel_patch_id; @@ -108,9 +109,9 @@ enum { }; static nir_def * -pervertex_lds_addr(nir_builder *b, nir_def *vertex_idx, unsigned per_vtx_bytes) +pervertex_lds_addr(nir_builder *b, lower_ngg_nogs_state *s, nir_def *vertex_idx, unsigned per_vtx_bytes) { - return nir_imul_imm(b, vertex_idx, per_vtx_bytes); + return nir_iadd_imm_nuw(b, nir_imul_imm(b, vertex_idx, per_vtx_bytes), s->lds_scratch_size); } static void @@ -206,7 +207,7 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *s, nir_def *arg) for (int i = 0; i < s->options->num_vertices_per_primitive; i++) { nir_def *vtx_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]); - nir_def *addr = pervertex_lds_addr(b, vtx_idx, s->pervertex_lds_bytes); + nir_def *addr = pervertex_lds_addr(b, s, vtx_idx, s->pervertex_lds_bytes); /* Edge flags share LDS with XFB. */ nir_def *edge = ac_nir_load_shared_xfb(b, 32, addr, &s->out, VARYING_SLOT_EDGE, 0); @@ -261,7 +262,7 @@ emit_ngg_nogs_prim_id_store_shared(nir_builder *b, lower_ngg_nogs_state *s) b, gs_vtx_indices, s->options->num_vertices_per_primitive, provoking_vertex); nir_def *prim_id = nir_load_primitive_id(b); - nir_def *addr = pervertex_lds_addr(b, provoking_vtx_idx, s->pervertex_lds_bytes); + nir_def *addr = pervertex_lds_addr(b, s, provoking_vtx_idx, s->pervertex_lds_bytes); /* primitive id is always at last of a vertex */ nir_store_shared(b, prim_id, addr, .base = s->pervertex_lds_bytes - 4); @@ -301,7 +302,7 @@ emit_store_ngg_nogs_es_primitive_id(nir_builder *b, lower_ngg_nogs_state *s) /* LDS address where the primitive ID is stored */ nir_def *thread_id_in_threadgroup = nir_load_local_invocation_index(b); nir_def *addr = - pervertex_lds_addr(b, thread_id_in_threadgroup, s->pervertex_lds_bytes); + pervertex_lds_addr(b, s, thread_id_in_threadgroup, s->pervertex_lds_bytes); /* Load primitive ID from LDS */ prim_id = nir_load_shared(b, 1, 32, addr, .base = s->pervertex_lds_bytes - 4); @@ -478,7 +479,7 @@ compact_vertices_after_culling(nir_builder *b, { nir_if *if_es_accepted = nir_push_if(b, nir_load_var(b, s->es_accepted_var)); { - nir_def *exporter_addr = pervertex_lds_addr(b, es_exporter_tid, pervertex_lds_bytes); + nir_def *exporter_addr = pervertex_lds_addr(b, s, es_exporter_tid, pervertex_lds_bytes); /* Store the exporter thread's index to the LDS space of the current thread so GS threads can load it */ nir_store_shared(b, nir_u2u8(b, es_exporter_tid), es_vertex_lds_addr, .base = lds_es_exporter_tid); @@ -568,7 +569,7 @@ compact_vertices_after_culling(nir_builder *b, if_gs_accepted = nir_push_if(b, gs_accepted); { - nir_def *exporter_addr = pervertex_lds_addr(b, gs_exporter_tid, pervertex_lds_bytes); + nir_def *exporter_addr = pervertex_lds_addr(b, s, gs_exporter_tid, pervertex_lds_bytes); nir_def *prim_exp_arg = nir_load_var(b, s->prim_exp_arg_var); /* Store the primitive export argument into the address of the exporter thread. */ @@ -1100,8 +1101,6 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c remove_culling_shader_outputs(b->shader, s); b->cursor = nir_after_impl(impl); - nir_def *lds_scratch_base = nir_load_lds_ngg_scratch_base_amd(b); - /* Run culling algorithms if culling is enabled. * * NGG culling can be enabled or disabled in runtime. @@ -1112,7 +1111,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c nir_if *if_cull_en = nir_push_if(b, nir_load_cull_any_enabled_amd(b)); { nir_def *invocation_index = nir_load_local_invocation_index(b); - nir_def *es_vertex_lds_addr = pervertex_lds_addr(b, invocation_index, pervertex_lds_bytes); + nir_def *es_vertex_lds_addr = pervertex_lds_addr(b, s, invocation_index, pervertex_lds_bytes); /* ES invocations store their vertex data to LDS for GS threads to read. */ if_es_thread = nir_push_if(b, es_thread); @@ -1153,7 +1152,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c /* Load W positions of vertices first because the culling code will use these first */ for (unsigned vtx = 0; vtx < s->options->num_vertices_per_primitive; ++vtx) { - s->vtx_addr[vtx] = pervertex_lds_addr(b, vtx_idx[vtx], pervertex_lds_bytes); + s->vtx_addr[vtx] = pervertex_lds_addr(b, s, vtx_idx[vtx], pervertex_lds_bytes); pos[vtx][3] = nir_load_shared(b, 1, 32, s->vtx_addr[vtx], .base = lds_es_pos_w); nir_store_var(b, gs_vtxaddr_vars[vtx], s->vtx_addr[vtx], 0x1u); } @@ -1209,7 +1208,7 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c nir_def *accepted[] = { es_accepted, gs_accepted }; ac_nir_wg_repack_result rep[2] = {0}; const unsigned num_rep = s->options->compact_primitives ? 2 : 1; - ac_nir_repack_invocations_in_workgroup(b, accepted, rep, num_rep, lds_scratch_base, + ac_nir_repack_invocations_in_workgroup(b, accepted, rep, num_rep, nir_imm_int(b, 0), s->max_num_waves, s->options->wave_size); nir_def *num_live_vertices_in_workgroup = rep[0].num_repacked_invocations; nir_def *es_exporter_tid = rep[0].repacked_invocation_index; @@ -1299,7 +1298,7 @@ ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s) edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1)); nir_def *tid = nir_load_local_invocation_index(b); - nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes); + nir_def *addr = pervertex_lds_addr(b, s, tid, s->pervertex_lds_bytes); /* Edge flags share LDS with XFB. */ ac_nir_store_shared_xfb(b, edgeflag, addr, &s->out, VARYING_SLOT_EDGE, 0); @@ -1335,7 +1334,7 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s) } nir_def *tid = nir_load_local_invocation_index(b); - nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes); + nir_def *addr = pervertex_lds_addr(b, s, tid, s->pervertex_lds_bytes); u_foreach_bit64(slot, xfb_outputs) { u_foreach_bit(c, xfb_mask[slot]) { @@ -1377,8 +1376,6 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s) { nir_xfb_info *info = ac_nir_get_sorted_xfb_info(b->shader); - nir_def *lds_scratch_base = nir_load_lds_ngg_scratch_base_amd(b); - /* Get global buffer offset where this workgroup will stream out data to. */ nir_def *generated_prim = nir_load_workgroup_num_input_primitives_amd(b); nir_def *gen_prim_per_stream[4] = {generated_prim, 0, 0, 0}; @@ -1387,7 +1384,7 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s) nir_def *so_buffer[4] = {0}; nir_def *tid_in_tg = nir_load_local_invocation_index(b); ac_nir_ngg_build_streamout_buffer_info(b, info, s->options->hw_info->gfx_level, s->options->has_xfb_prim_query, - s->options->use_gfx12_xfb_intrinsic, lds_scratch_base, tid_in_tg, + s->options->use_gfx12_xfb_intrinsic, nir_imm_int(b, 0), tid_in_tg, gen_prim_per_stream, so_buffer, buffer_offsets, emit_prim_per_stream); @@ -1409,7 +1406,7 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s) nir_push_if(b, nir_igt_imm(b, num_vert_per_prim, i)); { nir_def *vtx_lds_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]); - nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, s->pervertex_lds_bytes); + nir_def *vtx_lds_addr = pervertex_lds_addr(b, s, vtx_lds_idx, s->pervertex_lds_bytes); ac_nir_ngg_build_streamout_vertex(b, info, 0, so_buffer, buffer_offsets, i, vtx_lds_addr, &s->out); } @@ -1507,7 +1504,7 @@ ac_ngg_nogs_get_pervertex_lds_size(lower_ngg_nogs_state *s, bool ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *options, - uint32_t *out_lds_vertex_size) + uint32_t *out_lds_vertex_size, uint8_t *out_lds_scratch_size) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); assert(impl); @@ -1553,6 +1550,9 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option .gs_exported_var = gs_exported_var, .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size), .has_user_edgeflags = has_user_edgeflags, + .lds_scratch_size = ac_ngg_get_scratch_lds_size(shader->info.stage, options->max_workgroup_size, + options->wave_size, streamout_enabled, + options->can_cull, options->compact_primitives), }; /* Can't export the primitive ID both as per-vertex and per-primitive. */ @@ -1823,6 +1823,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option options->export_primitive_id, state.has_user_edgeflags, options->can_cull, state.deferred.uses_instance_id, state.deferred.uses_tess_primitive_id); + *out_lds_scratch_size = state.lds_scratch_size; return true; } diff --git a/src/amd/common/nir/ac_nir_lower_ngg_gs.c b/src/amd/common/nir/ac_nir_lower_ngg_gs.c index 1e7db649cd8..b5889e00a2d 100644 --- a/src/amd/common/nir/ac_nir_lower_ngg_gs.c +++ b/src/amd/common/nir/ac_nir_lower_ngg_gs.c @@ -23,9 +23,9 @@ typedef struct unsigned max_num_waves; unsigned num_vertices_per_primitive; nir_def *lds_addr_gs_out_vtx; - nir_def *lds_addr_gs_scratch; unsigned lds_bytes_per_gs_out_vertex; unsigned lds_offs_primflags; + unsigned lds_scratch_size; bool output_compile_time_known; bool streamout_enabled; /* Outputs */ @@ -79,7 +79,7 @@ ngg_gs_out_vertex_addr(nir_builder *b, nir_def *out_vtx_idx, lower_ngg_gs_state } nir_def *out_vtx_offs = nir_imul_imm(b, out_vtx_idx, s->lds_bytes_per_gs_out_vertex); - return nir_iadd_nuw(b, out_vtx_offs, s->lds_addr_gs_out_vtx); + return nir_iadd_nuw(b, out_vtx_offs, nir_iadd_imm_nuw(b, s->lds_addr_gs_out_vtx, s->lds_scratch_size)); } static nir_def * @@ -739,7 +739,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s) unsigned scratch_stride = ALIGN(s->max_num_waves, 4); nir_def *scratch_base = - nir_iadd_imm(b, s->lds_addr_gs_scratch, stream * scratch_stride); + nir_iadd_imm(b, s->lds_addr_gs_out_vtx, stream * scratch_stride); /* We want to export primitives to streamout buffer in sequence, * but not all vertices are alive or mark end of a primitive, so @@ -776,7 +776,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s) nir_def *buffer_offsets[4] = {0}; nir_def *so_buffer[4] = {0}; ac_nir_ngg_build_streamout_buffer_info(b, info, s->options->hw_info->gfx_level, s->options->has_xfb_prim_query, - s->options->use_gfx12_xfb_intrinsic, s->lds_addr_gs_scratch, tid_in_tg, + s->options->use_gfx12_xfb_intrinsic, s->lds_addr_gs_out_vtx, tid_in_tg, gen_prim, so_buffer, buffer_offsets, emit_prim); for (unsigned stream = 0; stream < 4; stream++) { @@ -871,7 +871,7 @@ ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s) nir_def *vertex_live = nir_ine_imm(b, out_vtx_primflag_0, 0); ac_nir_wg_repack_result rep = {0}; - ac_nir_repack_invocations_in_workgroup(b, &vertex_live, &rep, 1, s->lds_addr_gs_scratch, + ac_nir_repack_invocations_in_workgroup(b, &vertex_live, &rep, 1, s->lds_addr_gs_out_vtx, s->max_num_waves, s->options->wave_size); nir_def *workgroup_num_vertices = rep.num_repacked_invocations; @@ -900,16 +900,20 @@ ngg_gs_finale(nir_builder *b, lower_ngg_gs_state *s) bool ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, - uint32_t *out_lds_vertex_size) + uint32_t *out_lds_vertex_size, uint8_t *out_lds_scratch_size) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); assert(impl); + bool streamout_enabled = shader->xfb_info && !options->disable_streamout; lower_ngg_gs_state state = { .options = options, .impl = impl, .max_num_waves = DIV_ROUND_UP(options->max_workgroup_size, options->wave_size), - .streamout_enabled = shader->xfb_info && !options->disable_streamout, + .streamout_enabled = streamout_enabled, + .lds_scratch_size = ac_ngg_get_scratch_lds_size(shader->info.stage, options->max_workgroup_size, + options->wave_size, streamout_enabled, + options->can_cull, options->compact_primitives), }; if (!options->can_cull) { @@ -940,7 +944,6 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, .memory_semantics=NIR_MEMORY_ACQ_REL, .memory_modes=nir_var_mem_shared); state.lds_addr_gs_out_vtx = nir_load_lds_ngg_gs_out_vertex_base_amd(b); - state.lds_addr_gs_scratch = nir_load_lds_ngg_scratch_base_amd(b); /* Wrap the GS control flow. */ nir_if *if_gs_thread = nir_push_if(b, nir_is_subgroup_invocation_lt_amd(b, nir_load_merged_wave_info_amd(b), .base = 8)); @@ -998,5 +1001,6 @@ ac_nir_lower_ngg_gs(nir_shader *shader, const ac_nir_lower_ngg_options *options, nir_remove_dead_variables(shader, nir_var_function_temp, NULL); *out_lds_vertex_size = state.lds_bytes_per_gs_out_vertex; + *out_lds_scratch_size = state.lds_scratch_size; return nir_progress(true, impl, nir_metadata_none); } diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h index eb97834738e..0f93944a56f 100644 --- a/src/amd/compiler/aco_shader_info.h +++ b/src/amd/compiler/aco_shader_info.h @@ -188,7 +188,6 @@ enum aco_symbol_id { aco_symbol_invalid, aco_symbol_scratch_addr_lo, aco_symbol_scratch_addr_hi, - aco_symbol_lds_ngg_scratch_base, aco_symbol_lds_ngg_gs_out_vertex_base, aco_symbol_const_data_addr, }; diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp index 2f3ef3270e6..93431691b96 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp @@ -540,7 +540,6 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_intrinsic_ballot_relaxed: case nir_intrinsic_bindless_image_samples: case nir_intrinsic_load_scalar_arg_amd: - case nir_intrinsic_load_lds_ngg_scratch_base_amd: case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: case nir_intrinsic_load_smem_amd: case nir_intrinsic_unit_test_uniform_amd: type = RegType::sgpr; break; diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 8b40ac210b0..660b16c34e2 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -4932,12 +4932,6 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) ctx->block->instructions.emplace_back(std::move(vec)); break; } - case nir_intrinsic_load_lds_ngg_scratch_base_amd: { - Temp dst = get_ssa_temp(ctx, &instr->def); - bld.sop1(aco_opcode::p_load_symbol, Definition(dst), - Operand::c32(aco_symbol_lds_ngg_scratch_base)); - break; - } case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: { Temp dst = get_ssa_temp(ctx, &instr->def); bld.sop1(aco_opcode::p_load_symbol, Definition(dst), diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 0233b672872..22f5940b879 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -2768,7 +2768,6 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins result = ac_build_gather_values(&ctx->ac, values, 3); break; } - case nir_intrinsic_load_lds_ngg_scratch_base_amd: case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: result = ctx->abi->intrinsic_load(ctx->abi, instr); break; diff --git a/src/amd/vulkan/nir/radv_nir_lower_abi.c b/src/amd/vulkan/nir/radv_nir_lower_abi.c index aba35129b4d..23cf8e75752 100644 --- a/src/amd/vulkan/nir/radv_nir_lower_abi.c +++ b/src/amd/vulkan/nir/radv_nir_lower_abi.c @@ -370,9 +370,6 @@ lower_abi_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state) case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: replacement = GET_SGPR_FIELD_NIR(s->args->ngg_lds_layout, NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE); break; - case nir_intrinsic_load_lds_ngg_scratch_base_amd: - replacement = GET_SGPR_FIELD_NIR(s->args->ngg_lds_layout, NGG_LDS_LAYOUT_SCRATCH_BASE); - break; case nir_intrinsic_load_num_vertices_per_primitive_amd: { unsigned num_vertices; diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index a9569549842..3f816125467 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -2269,12 +2269,10 @@ radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *e const uint32_t ngg_lds_layout_offset = radv_get_user_sgpr_loc(shader, AC_UD_NGG_LDS_LAYOUT); assert(ngg_lds_layout_offset); - assert(!(shader->info.ngg_info.esgs_ring_size & 0xffff0000) && - !(shader->info.ngg_info.scratch_lds_base & 0xffff0000)); + assert(!(shader->info.ngg_info.esgs_ring_size & 0xffff0000)); radeon_set_sh_reg(ngg_lds_layout_offset, - SET_SGPR_FIELD(NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE, shader->info.ngg_info.esgs_ring_size) | - SET_SGPR_FIELD(NGG_LDS_LAYOUT_SCRATCH_BASE, shader->info.ngg_info.scratch_lds_base)); + SET_SGPR_FIELD(NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE, shader->info.ngg_info.esgs_ring_size)); radeon_end(); } diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 059d8795e7b..f11594cd8e0 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -275,15 +275,8 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, const struct radv_nir declare_esgs_ring(&ctx); if (ctx.stage == MESA_SHADER_GEOMETRY) { - /* Scratch space used by NGG GS for repacking vertices at the end. */ - LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8); - LLVMValueRef gs_ngg_scratch = - LLVMAddGlobalInAddressSpace(ctx.ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS); - LLVMSetInitializer(gs_ngg_scratch, LLVMGetUndef(ai32)); - LLVMSetLinkage(gs_ngg_scratch, LLVMExternalLinkage); - LLVMSetAlignment(gs_ngg_scratch, 4); - /* Vertex emit space used by NGG GS for storing all vertex attributes. */ + LLVMTypeRef ai32 = LLVMArrayType(ctx.ac.i32, 8); LLVMValueRef gs_ngg_emit = LLVMAddGlobalInAddressSpace(ctx.ac.module, LLVMArrayType(ctx.ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); LLVMSetInitializer(gs_ngg_emit, LLVMGetUndef(ai32)); diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 071af4c9c44..b8ec1d59009 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -815,11 +815,13 @@ radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage, options.export_primitive_id_per_prim = info->outinfo.export_prim_id_per_primitive; options.instance_rate_inputs = gfx_state->vi.instance_rate_inputs << VERT_ATTRIB_GENERIC0; - NIR_PASS(_, nir, ac_nir_lower_ngg_nogs, &options, &ngg_stage->info.ngg_lds_vertex_size); + NIR_PASS(_, nir, ac_nir_lower_ngg_nogs, &options, &ngg_stage->info.ngg_lds_vertex_size, + &ngg_stage->info.ngg_lds_scratch_size); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { assert(info->is_ngg); - NIR_PASS(_, nir, ac_nir_lower_ngg_gs, &options, &ngg_stage->info.ngg_lds_vertex_size); + NIR_PASS(_, nir, ac_nir_lower_ngg_gs, &options, &ngg_stage->info.ngg_lds_vertex_size, + &ngg_stage->info.ngg_lds_scratch_size); } else if (nir->info.stage == MESA_SHADER_MESH) { /* ACO aligns the workgroup size to the wave size. */ unsigned hw_workgroup_size = ALIGN(info->workgroup_size, info->wave_size); @@ -1407,11 +1409,6 @@ radv_open_rtld_binary(struct radv_device *device, const struct radv_shader_binar sym->name = "ngg_emit"; sym->size = binary->info.ngg_info.ngg_emit_size * 4; sym->align = 4; - - sym = &lds_symbols[num_lds_symbols++]; - sym->name = "ngg_scratch"; - sym->size = 8; - sym->align = 4; } struct ac_rtld_open_info open_info = { diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index d74bb9d60b7..2c3ee50246f 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -226,8 +226,6 @@ struct radv_nir_compiler_options { #define NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE__SHIFT 0 #define NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE__MASK 0xffff -#define NGG_LDS_LAYOUT_SCRATCH_BASE__SHIFT 16 -#define NGG_LDS_LAYOUT_SCRATCH_BASE__MASK 0xffff #define NGG_STATE_NUM_VERTS_PER_PRIM__SHIFT 0 #define NGG_STATE_NUM_VERTS_PER_PRIM__MASK 0x7 diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index f6a150a3e4b..8e69c66844d 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -1421,24 +1421,20 @@ radv_get_pre_rast_input_topology(const struct radv_shader_info *es_info, const s } static unsigned -gfx10_get_ngg_scratch_lds_base(const struct radv_device *device, const struct radv_shader_info *es_info, - const struct radv_shader_info *gs_info, const struct gfx10_ngg_info *ngg_info) +gfx10_get_ngg_vert_prim_lds_size(const struct radv_device *device, const struct radv_shader_info *es_info, + const struct radv_shader_info *gs_info, const struct gfx10_ngg_info *ngg_info) { - uint32_t scratch_lds_base; - if (gs_info) { const unsigned esgs_ring_lds_bytes = ngg_info->esgs_ring_size; const unsigned gs_total_out_vtx_bytes = ngg_info->ngg_emit_size * 4u; - scratch_lds_base = ALIGN(esgs_ring_lds_bytes + gs_total_out_vtx_bytes, 8u /* for the repacking code */); + return esgs_ring_lds_bytes + gs_total_out_vtx_bytes; } else { assert(ngg_info->hw_max_esverts <= 256); unsigned total_es_lds_bytes = es_info->ngg_lds_vertex_size * ngg_info->hw_max_esverts; - scratch_lds_base = ALIGN(total_es_lds_bytes, 8u); + return total_es_lds_bytes; } - - return scratch_lds_base; } void @@ -1648,13 +1644,8 @@ gfx10_get_ngg_info(const struct radv_device *device, struct radv_shader_info *es assert(out->hw_max_esverts >= min_esverts); /* HW limitation */ - out->scratch_lds_base = gfx10_get_ngg_scratch_lds_base(device, es_info, gs_info, out); - - /* Get scratch LDS usage. */ const struct radv_shader_info *info = gs_info ? gs_info : es_info; - const unsigned scratch_lds_size = ac_ngg_get_scratch_lds_size(info->stage, info->workgroup_size, info->wave_size, - pdev->use_ngg_streamout, info->has_ngg_culling, false); - out->lds_size = out->scratch_lds_base + scratch_lds_size; + out->lds_size = info->ngg_lds_scratch_size + gfx10_get_ngg_vert_prim_lds_size(device, es_info, gs_info, out); unsigned workgroup_size = ac_compute_ngg_workgroup_size(max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor); diff --git a/src/amd/vulkan/radv_shader_info.h b/src/amd/vulkan/radv_shader_info.h index f72ea8af4af..2a4996f14f8 100644 --- a/src/amd/vulkan/radv_shader_info.h +++ b/src/amd/vulkan/radv_shader_info.h @@ -75,7 +75,6 @@ struct gfx10_ngg_info { uint32_t prim_amp_factor; uint32_t vgt_esgs_ring_itemsize; uint32_t esgs_ring_size; - uint32_t scratch_lds_base; uint32_t lds_size; bool max_vert_out_per_gs_instance; }; @@ -98,6 +97,7 @@ struct radv_shader_info { bool has_ngg_early_prim_export; bool has_prim_query; bool has_xfb_query; + uint8_t ngg_lds_scratch_size; uint32_t num_tess_patches; uint32_t esgs_itemsize; /* Only for VS or TES as ES */ uint32_t ngg_lds_vertex_size; /* VS,TES: Cull+XFB, GS: GSVS size */ diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 593f38e9a8f..dc5800e5daa 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -321,7 +321,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_load_ordered_id_amd: case nir_intrinsic_load_gs_wave_id_amd: case nir_intrinsic_load_provoking_vtx_in_prim_amd: - case nir_intrinsic_load_lds_ngg_scratch_base_amd: case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: case nir_intrinsic_load_btd_shader_type_intel: case nir_intrinsic_load_base_global_invocation_id: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 1f9cb1c6915..700dc166896 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -2000,8 +2000,6 @@ intrinsic("atomic_add_xfb_prim_count_amd", [1], indices=[STREAM_ID]) # src[] = { invocation_count }. intrinsic("atomic_add_shader_invocation_count_amd", [1]) -# LDS offset for scratch section in NGG shader -system_value("lds_ngg_scratch_base_amd", 1) # LDS offset for NGG GS shader vertex emit system_value("lds_ngg_gs_out_vertex_base_amd", 1) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 308ebb5f697..22998c2594d 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -34,18 +34,6 @@ static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse); } -unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader) -{ - const struct si_shader_selector *sel = shader->selector; - - return ac_ngg_get_scratch_lds_size(sel->stage, - si_get_max_workgroup_size(shader), - shader->wave_size, - shader->info.num_streamout_vec4s != 0, - si_shader_culling_enabled(shader), - false) / 4; -} - /** * Determine subgroup information like maximum number of vertices and prims. * @@ -65,9 +53,8 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader) const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1; /* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */ - const unsigned scratch_lds_size = gfx10_ngg_get_scratch_dw_size(shader); - /* Scratch is at last of LDS space and 2 dwords aligned, so it may cost more for alignment. */ - const unsigned max_lds_size = 16 * 1024 - ALIGN(scratch_lds_size, 2); + /* The LDS scratch is at the beginning of LDS space. */ + const unsigned max_lds_size = 16 * 1024 - shader->info.ngg_lds_scratch_size / 4; const unsigned target_lds_size = max_lds_size; unsigned esvert_lds_size = 0; unsigned gsprim_lds_size = 0; @@ -201,6 +188,11 @@ retry_select_mode: esvert_lds_size; shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size; + if (gs_stage == MESA_SHADER_GEOMETRY) + shader->ngg.ngg_emit_size += shader->info.ngg_lds_scratch_size / 4; + else + shader->gs_info.esgs_ring_size += shader->info.ngg_lds_scratch_size / 4; + assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */ /* If asserts are disabled, we use the same conditions to return false */ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 972eceaba6f..108520f492e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -459,15 +459,6 @@ static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shade if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) size_in_dw += shader->ngg.ngg_emit_size; - if (shader->key.ge.as_ngg) { - unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader); - if (scratch_dw_size) { - /* scratch base address needs to be 8 byte aligned */ - size_in_dw = ALIGN(size_in_dw, 2); - size_in_dw += scratch_dw_size; - } - } - shader->config.lds_size = DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage)); } @@ -1172,7 +1163,8 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir, options.instance_rate_inputs = instance_rate_inputs; options.cull_clipdist_mask = clip_plane_enable; - NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options, &shader->info.ngg_lds_vertex_size); + NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options, &shader->info.ngg_lds_vertex_size, + &shader->info.ngg_lds_scratch_size); } else { assert(nir->info.stage == MESA_SHADER_GEOMETRY); @@ -1185,7 +1177,8 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir, if (key->ge.part.gs.es) nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory; - NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options, &shader->info.ngg_lds_vertex_size); + NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options, &shader->info.ngg_lds_vertex_size, + &shader->info.ngg_lds_scratch_size); } /* may generate some vector output store */ diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 76b2e18d788..be3baa3d1bd 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -757,7 +757,7 @@ struct gfx9_gs_info { unsigned gs_prims_per_subgroup; unsigned gs_inst_prims_in_subgroup; unsigned max_prims_per_subgroup; - unsigned esgs_ring_size; /* in bytes */ + unsigned esgs_ring_size; /* in dwords */ }; struct si_shader { diff --git a/src/gallium/drivers/radeonsi/si_shader_aco.c b/src/gallium/drivers/radeonsi/si_shader_aco.c index 7f13b322a8d..0ce57cf3e7a 100644 --- a/src/gallium/drivers/radeonsi/si_shader_aco.c +++ b/src/gallium/drivers/radeonsi/si_shader_aco.c @@ -207,13 +207,6 @@ si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code_for_write, else value |= S_008F04_SWIZZLE_ENABLE_GFX6(1); break; - case aco_symbol_lds_ngg_scratch_base: - assert(sel->stage <= MESA_SHADER_GEOMETRY && key->ge.as_ngg); - value = shader->gs_info.esgs_ring_size * 4; - if (sel->stage == MESA_SHADER_GEOMETRY) - value += shader->ngg.ngg_emit_size * 4; - value = ALIGN(value, 8); - break; case aco_symbol_lds_ngg_gs_out_vertex_base: assert(sel->stage == MESA_SHADER_GEOMETRY && key->ge.as_ngg); value = shader->gs_info.esgs_ring_size * 4; diff --git a/src/gallium/drivers/radeonsi/si_shader_info.h b/src/gallium/drivers/radeonsi/si_shader_info.h index 47d84f5e378..1563f22b696 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.h +++ b/src/gallium/drivers/radeonsi/si_shader_info.h @@ -237,6 +237,7 @@ struct si_shader_variant_info { uint8_t nr_pos_exports; uint8_t nr_param_exports; uint8_t num_streamout_vec4s; + uint8_t ngg_lds_scratch_size; unsigned private_mem_vgprs; unsigned max_simd_waves; uint32_t ngg_lds_vertex_size; /* VS,TES: Cull+XFB, GS: GSVS size */ diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 876d2d39cab..7da9019999f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -108,7 +108,6 @@ void si_get_ps_epilog_args(struct si_shader_args *args, /* gfx10_shader_ngg.c */ bool gfx10_ngg_export_prim_early(struct si_shader *shader); -unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader); bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader); struct nir_def; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 64dbc1b1d51..500eb368f5e 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -447,9 +447,6 @@ static LLVMValueRef si_llvm_load_intrinsic(struct ac_shader_abi *abi, nir_intrin struct si_shader_context *ctx = si_shader_context_from_abi(abi); switch (intrin->intrinsic) { - case nir_intrinsic_load_lds_ngg_scratch_base_amd: - return LLVMBuildPtrToInt(ctx->ac.builder, ctx->gs_ngg_scratch.value, ctx->ac.i32, ""); - case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: return LLVMBuildPtrToInt(ctx->ac.builder, ctx->gs_ngg_emit, ctx->ac.i32, ""); @@ -528,14 +525,6 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade case MESA_SHADER_GEOMETRY: if (ctx->shader->key.ge.as_ngg) { - LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader)); - ctx->gs_ngg_scratch = (struct ac_llvm_pointer) { - .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS), - .pointee_type = ai32 - }; - LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(ai32)); - LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8); - ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace( ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); @@ -577,21 +566,6 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade if (is_merged_esgs_stage || is_nogs_ngg_stage) si_llvm_declare_lds_esgs_ring(ctx); - /* This is really only needed when streamout and / or vertex - * compaction is enabled. - */ - if (is_nogs_ngg_stage && - (shader->info.num_streamout_vec4s || si_shader_culling_enabled(shader))) { - LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader)); - ctx->gs_ngg_scratch = (struct ac_llvm_pointer) { - .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", - AC_ADDR_SPACE_LDS), - .pointee_type = asi32 - }; - LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(asi32)); - LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8); - } - /* For merged shaders (VS-TCS, VS-GS, TES-GS): */ if (ctx->screen->info.gfx_level >= GFX9 && si_is_merged_shader(shader)) { /* Set EXEC = ~0 before the first shader. For monolithic shaders, the wrapper diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.h b/src/gallium/drivers/radeonsi/si_shader_llvm.h index 9daf3ae728f..46662564f77 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.h +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.h @@ -31,7 +31,6 @@ struct si_shader_context { struct ac_llvm_compiler *compiler; LLVMValueRef gs_ngg_emit; - struct ac_llvm_pointer gs_ngg_scratch; LLVMValueRef return_value; };