From 9c55b0ca205136e182ac7ce46bf5e91673fd010c Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 16 Jun 2025 16:28:41 +0100 Subject: [PATCH] aco: use MUBUF for global access with SGPR address on GFX7/8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should be better than using FLAT, which only supports a VGPR address. fossil-db (polaris10): Totals from 159 (0.26% of 62070) affected shaders: MaxWaves: 789 -> 803 (+1.77%) Instrs: 234284 -> 230557 (-1.59%); split: -1.71%, +0.12% CodeSize: 1212324 -> 1186716 (-2.11%); split: -2.23%, +0.11% SGPRs: 10504 -> 10712 (+1.98%) VGPRs: 10556 -> 10236 (-3.03%); split: -3.37%, +0.34% SpillSGPRs: 579 -> 577 (-0.35%) Latency: 3903056 -> 3875625 (-0.70%); split: -0.87%, +0.16% InvThroughput: 3139443 -> 3114426 (-0.80%); split: -0.86%, +0.07% VClause: 4205 -> 4433 (+5.42%); split: -0.43%, +5.85% SClause: 4461 -> 4445 (-0.36%); split: -0.43%, +0.07% Copies: 30889 -> 31507 (+2.00%); split: -0.29%, +2.29% PreSGPRs: 7370 -> 7609 (+3.24%) PreVGPRs: 8339 -> 8193 (-1.75%) VALU: 175025 -> 170232 (-2.74%); split: -2.77%, +0.03% SALU: 27269 -> 28532 (+4.63%); split: -0.01%, +4.64% Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- .../aco_select_nir_intrinsics.cpp | 57 ++++++++++++------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index dcbf4370923..9780581a3aa 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -806,7 +806,7 @@ const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, 4095}; const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, 2047}; Temp -get_gfx6_global_rsrc(Builder& bld, Temp addr) +get_mubuf_global_rsrc(Builder& bld, Temp addr) { uint32_t desc[4]; ac_build_raw_buffer_descriptor(bld.program->gfx_level, 0, 0xffffffff, desc); @@ -818,6 +818,20 @@ get_gfx6_global_rsrc(Builder& bld, Temp addr) Operand::c32(desc[3])); } +Temp +add64_const64(Builder& bld, Temp addr, uint64_t offset) +{ + /* This could be more efficient if offset>UINT32_MAX by doing a full 64-bit addition, + * but that should be really rare. + */ + while (offset) { + uint32_t src2 = MIN2(offset, UINT32_MAX); + addr = add64_32(bld, addr, Operand::c32(src2)); + offset -= src2; + } + return addr; +} + Format lower_global_address(isel_context* ctx, Builder& bld, uint32_t offset_in, Temp* address_inout, uint32_t* const_offset_inout, Temp* offset_inout, nir_src* offset_src) @@ -829,7 +843,7 @@ lower_global_address(isel_context* ctx, Builder& bld, uint32_t offset_in, Temp* Format format = Format::MUBUF; if (bld.program->gfx_level >= GFX9) format = Format::GLOBAL; - else if (bld.program->gfx_level >= GFX7) + else if (bld.program->gfx_level >= GFX7 && address.type() == RegType::vgpr) format = Format::FLAT; uint64_t max_const_offset_plus_one = @@ -842,24 +856,15 @@ lower_global_address(isel_context* ctx, Builder& bld, uint32_t offset_in, Temp* const_offset %= max_const_offset_plus_one; if (!offset.id()) { - while (unlikely(excess_offset > UINT32_MAX)) { - address = add64_32(bld, address, Operand::c32(UINT32_MAX)); - excess_offset -= UINT32_MAX; - } - if (excess_offset) - offset = bld.copy(bld.def(s1), Operand::c32(excess_offset)); + address = add64_const64(bld, address, excess_offset / UINT32_MAX * UINT32_MAX); + if (excess_offset % UINT32_MAX) + offset = bld.copy(bld.def(s1), Operand::c32(excess_offset % UINT32_MAX)); } else { /* If we add to "offset", we would transform the indended * "address + u2u64(offset) + u2u64(const_offset)" into * "address + u2u64(offset + const_offset)", so add to the address. - * This could be more efficient if excess_offset>UINT32_MAX by doing a full 64-bit addition, - * but that should be really rare. */ - while (excess_offset) { - uint32_t src2 = MIN2(excess_offset, UINT32_MAX); - address = add64_32(bld, address, Operand::c32(src2)); - excess_offset -= src2; - } + address = add64_const64(bld, address, excess_offset); } if (format == Format::MUBUF) { @@ -869,8 +874,14 @@ lower_global_address(isel_context* ctx, Builder& bld, uint32_t offset_in, Temp* if (offset.id() && (address.type() == RegType::vgpr ? offset.type() != RegType::sgpr : add_might_overflow(ctx, offset_src, const_offset))) { - address = add64_32(bld, address, Operand(offset)); - offset = Temp(); + if (offset.type() == RegType::vgpr && bld.program->gfx_level > GFX6) { + assert(address.type() == RegType::sgpr); + address = add64_const64(bld, address, const_offset); + const_offset = 0; + } else { + address = add64_32(bld, address, Operand(offset)); + offset = Temp(); + } } offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero()); } else if (format == Format::FLAT) { @@ -950,8 +961,10 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_need RegClass rc = RegClass::get(RegType::vgpr, bytes_size); Temp val = rc == info.dst.regClass() ? info.dst : bld.tmp(rc); if (use_mubuf) { + assert(bld.program->gfx_level == GFX6 || addr.type() != RegType::vgpr); + aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; - mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr)); + mubuf->operands[0] = Operand(get_mubuf_global_rsrc(bld, addr)); if (addr.type() == RegType::vgpr) mubuf->operands[1] = Operand(addr); else if (offset.type() == RegType::vgpr) @@ -2576,11 +2589,11 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr) ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(flat)); } else { - assert(ctx->options->gfx_level == GFX6); + assert(ctx->options->gfx_level == GFX6 || write_address.type() != RegType::vgpr); aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); - Temp rsrc = get_gfx6_global_rsrc(bld, write_address); + Temp rsrc = get_mubuf_global_rsrc(bld, write_address); aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, 0)}; mubuf->operands[0] = Operand(rsrc); @@ -2713,12 +2726,12 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr) ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(flat)); } else { - assert(ctx->options->gfx_level == GFX6); + assert(ctx->options->gfx_level == GFX6 || addr.type() != RegType::vgpr); UNUSED aco_opcode image_op; translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op); - Temp rsrc = get_gfx6_global_rsrc(bld, addr); + Temp rsrc = get_mubuf_global_rsrc(bld, addr); aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;