diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 75b9fff0159..5d5f7e429b4 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -11409,7 +11409,7 @@ std::pair ngg_gs_workgroup_reduce_and_scan(isel_context *ctx, Temp s /* The first lane of each wave loads every wave's results from LDS, to avoid bank conflicts */ Temp reduction_per_wave_vector = load_lds(ctx, 4u * num_lds_dwords, bld.tmp(RegClass(RegType::vgpr, num_lds_dwords)), - bld.copy(bld.def(v1), Operand(0u)), ctx->ngg_gs_scratch_addr, 4u); + bld.copy(bld.def(v1), Operand(0u)), ctx->ngg_gs_scratch_addr, 16u); begin_divergent_if_else(ctx, &ic); end_divergent_if(ctx, &ic); diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index bbdbd800e7a..29b3845f02e 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -419,8 +419,9 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir) ctx->ngg_gs_emit_vtx_bytes = ctx->ngg_gs_primflags_offset + 4u; ctx->ngg_gs_emit_addr = esgs_ring_bytes; ctx->ngg_gs_scratch_addr = ctx->ngg_gs_emit_addr + ngg_emit_bytes; + ctx->ngg_gs_scratch_addr = ALIGN(ctx->ngg_gs_scratch_addr, 16u); - unsigned total_lds_bytes = esgs_ring_bytes + ngg_emit_bytes + ngg_gs_scratch_bytes; + unsigned total_lds_bytes = ctx->ngg_gs_scratch_addr + ngg_gs_scratch_bytes; assert(total_lds_bytes >= ctx->ngg_gs_emit_addr); assert(total_lds_bytes >= ctx->ngg_gs_scratch_addr); ctx->program->config->lds_size = DIV_ROUND_UP(total_lds_bytes, ctx->program->dev.lds_encoding_granule);