From 684943bd1ff5cee84bcd6ce19dd1a2edc99f0836 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 2 Jun 2025 14:53:02 +0100 Subject: [PATCH] aco/gfx6: allow vgpr offset for global access with sgpr address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No reason why we can't use offen like normal buffer loads. fossil-db (pitcairn): Totals from 122 (0.20% of 62069) affected shaders: MaxWaves: 521 -> 525 (+0.77%) Instrs: 238341 -> 237228 (-0.47%); split: -0.57%, +0.10% CodeSize: 1196260 -> 1188076 (-0.68%); split: -0.78%, +0.09% SGPRs: 8752 -> 8760 (+0.09%); split: -0.64%, +0.73% VGPRs: 10456 -> 10440 (-0.15%); split: -0.88%, +0.73% Latency: 3958385 -> 3946186 (-0.31%); split: -0.38%, +0.07% InvThroughput: 3097193 -> 3084417 (-0.41%); split: -0.42%, +0.01% VClause: 4058 -> 4500 (+10.89%); split: -0.02%, +10.92% SClause: 4511 -> 4500 (-0.24%); split: -0.42%, +0.18% Copies: 31228 -> 31718 (+1.57%); split: -0.38%, +1.95% PreSGPRs: 7211 -> 7461 (+3.47%) PreVGPRs: 8174 -> 8147 (-0.33%); split: -0.34%, +0.01% VALU: 174779 -> 173294 (-0.85%); split: -0.87%, +0.02% SALU: 29138 -> 29641 (+1.73%); split: -0.09%, +1.82% Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- .../aco_select_nir_intrinsics.cpp | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 5466a6f097c..e40f81a6404 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -855,10 +855,11 @@ lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout, } if (bld.program->gfx_level == GFX6) { - /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */ + /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (SGPR address, VGPR offset) */ + /* GFX6 (MUBUF-addr64): (VGPR address, SGPR offset) */ /* Disallow SGPR address with both a const_offset and offset because of possible overflow. */ - if (offset.id() && (offset.type() != RegType::sgpr || - (address.type() == RegType::sgpr && const_offset > 0))) { + if (offset.id() && + (address.type() == RegType::vgpr ? offset.type() != RegType::sgpr : const_offset > 0)) { address = add64_32(bld, address, Operand(offset)); offset = Temp(); } @@ -937,8 +938,14 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_need if (use_mubuf) { aco_ptr mubuf{create_instruction(op, Format::MUBUF, 3, 1)}; mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr)); - mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); - mubuf->operands[2] = Operand(offset); + if (addr.type() == RegType::vgpr) + mubuf->operands[1] = Operand(addr); + else if (offset.type() == RegType::vgpr) + mubuf->operands[1] = Operand(offset); + else + mubuf->operands[1] = Operand(v1); + mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0); + mubuf->mubuf().offen = offset.type() == RegType::vgpr; mubuf->mubuf().cache = info.cache; mubuf->mubuf().offset = const_offset; mubuf->mubuf().addr64 = addr.type() == RegType::vgpr; @@ -2564,10 +2571,16 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr) aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, 0)}; mubuf->operands[0] = Operand(rsrc); - mubuf->operands[1] = - write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1); - mubuf->operands[2] = Operand(write_offset); + if (write_address.type() == RegType::vgpr) + mubuf->operands[1] = Operand(write_address); + else if (write_offset.type() == RegType::vgpr) + mubuf->operands[1] = Operand(write_offset); + else + mubuf->operands[1] = Operand(v1); + mubuf->operands[2] = + write_offset.type() == RegType::sgpr ? Operand(write_offset) : Operand::c32(0); mubuf->operands[3] = Operand(write_datas[i]); + mubuf->mubuf().offen = write_offset.type() == RegType::vgpr; mubuf->mubuf().cache = get_cache_flags(ctx, access); mubuf->mubuf().offset = write_const_offset; mubuf->mubuf().addr64 = write_address.type() == RegType::vgpr; @@ -2699,13 +2712,19 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr) aco_ptr mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; mubuf->operands[0] = Operand(rsrc); - mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); - mubuf->operands[2] = Operand(offset); + if (addr.type() == RegType::vgpr) + mubuf->operands[1] = Operand(addr); + else if (offset.type() == RegType::vgpr) + mubuf->operands[1] = Operand(offset); + else + mubuf->operands[1] = Operand(v1); + mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0); mubuf->operands[3] = Operand(data); Definition def = return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition(); if (return_previous) mubuf->definitions[0] = def; + mubuf->mubuf().offen = offset.type() == RegType::vgpr; mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous); mubuf->mubuf().offset = const_offset; mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;