aco: use MUBUF for global access with SGPR address on GFX7/8

This should be better than using FLAT, which only supports a VGPR address.

fossil-db (polaris10):
Totals from 159 (0.26% of 62070) affected shaders:
MaxWaves: 789 -> 803 (+1.77%)
Instrs: 234284 -> 230557 (-1.59%); split: -1.71%, +0.12%
CodeSize: 1212324 -> 1186716 (-2.11%); split: -2.23%, +0.11%
SGPRs: 10504 -> 10712 (+1.98%)
VGPRs: 10556 -> 10236 (-3.03%); split: -3.37%, +0.34%
SpillSGPRs: 579 -> 577 (-0.35%)
Latency: 3903056 -> 3875625 (-0.70%); split: -0.87%, +0.16%
InvThroughput: 3139443 -> 3114426 (-0.80%); split: -0.86%, +0.07%
VClause: 4205 -> 4433 (+5.42%); split: -0.43%, +5.85%
SClause: 4461 -> 4445 (-0.36%); split: -0.43%, +0.07%
Copies: 30889 -> 31507 (+2.00%); split: -0.29%, +2.29%
PreSGPRs: 7370 -> 7609 (+3.24%)
PreVGPRs: 8339 -> 8193 (-1.75%)
VALU: 175025 -> 170232 (-2.74%); split: -2.77%, +0.03%
SALU: 27269 -> 28532 (+4.63%); split: -0.01%, +4.64%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35465>
This commit is contained in:
Rhys Perry
2025-06-16 16:28:41 +01:00
committed by Marge Bot
parent 0094e6c32a
commit 9c55b0ca20
@@ -806,7 +806,7 @@ const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, 4095};
const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, 2047};
Temp
get_gfx6_global_rsrc(Builder& bld, Temp addr)
get_mubuf_global_rsrc(Builder& bld, Temp addr)
{
uint32_t desc[4];
ac_build_raw_buffer_descriptor(bld.program->gfx_level, 0, 0xffffffff, desc);
@@ -818,6 +818,20 @@ get_gfx6_global_rsrc(Builder& bld, Temp addr)
Operand::c32(desc[3]));
}
/* Adds the 64-bit constant "offset" to "addr" and returns the resulting address.
 *
 * The constant is applied through add64_32(), one 32-bit operand at a time, so a value larger
 * than UINT32_MAX is split across several calls. A zero offset returns "addr" untouched.
 */
Temp
add64_const64(Builder& bld, Temp addr, uint64_t offset)
{
   /* A single full 64-bit addition would be cheaper when offset > UINT32_MAX, but such
    * offsets are expected to be really rare.
    */
   for (uint64_t remaining = offset; remaining != 0;) {
      /* Largest piece that still fits into a 32-bit constant operand. */
      const uint32_t chunk =
         remaining > UINT32_MAX ? UINT32_MAX : static_cast<uint32_t>(remaining);
      addr = add64_32(bld, addr, Operand::c32(chunk));
      remaining -= chunk;
   }

   return addr;
}
Format
lower_global_address(isel_context* ctx, Builder& bld, uint32_t offset_in, Temp* address_inout,
uint32_t* const_offset_inout, Temp* offset_inout, nir_src* offset_src)
@@ -829,7 +843,7 @@ lower_global_address(isel_context* ctx, Builder& bld, uint32_t offset_in, Temp*
Format format = Format::MUBUF;
if (bld.program->gfx_level >= GFX9)
format = Format::GLOBAL;
else if (bld.program->gfx_level >= GFX7)
else if (bld.program->gfx_level >= GFX7 && address.type() == RegType::vgpr)
format = Format::FLAT;
uint64_t max_const_offset_plus_one =
@@ -842,24 +856,15 @@ lower_global_address(isel_context* ctx, Builder& bld, uint32_t offset_in, Temp*
const_offset %= max_const_offset_plus_one;
if (!offset.id()) {
while (unlikely(excess_offset > UINT32_MAX)) {
address = add64_32(bld, address, Operand::c32(UINT32_MAX));
excess_offset -= UINT32_MAX;
}
if (excess_offset)
offset = bld.copy(bld.def(s1), Operand::c32(excess_offset));
address = add64_const64(bld, address, excess_offset / UINT32_MAX * UINT32_MAX);
if (excess_offset % UINT32_MAX)
offset = bld.copy(bld.def(s1), Operand::c32(excess_offset % UINT32_MAX));
} else {
/* If we add to "offset", we would transform the intended
* "address + u2u64(offset) + u2u64(const_offset)" into
* "address + u2u64(offset + const_offset)", so add to the address.
* This could be more efficient if excess_offset>UINT32_MAX by doing a full 64-bit addition,
* but that should be really rare.
*/
while (excess_offset) {
uint32_t src2 = MIN2(excess_offset, UINT32_MAX);
address = add64_32(bld, address, Operand::c32(src2));
excess_offset -= src2;
}
address = add64_const64(bld, address, excess_offset);
}
if (format == Format::MUBUF) {
@@ -869,8 +874,14 @@ lower_global_address(isel_context* ctx, Builder& bld, uint32_t offset_in, Temp*
if (offset.id() &&
(address.type() == RegType::vgpr ? offset.type() != RegType::sgpr
: add_might_overflow(ctx, offset_src, const_offset))) {
address = add64_32(bld, address, Operand(offset));
offset = Temp();
if (offset.type() == RegType::vgpr && bld.program->gfx_level > GFX6) {
assert(address.type() == RegType::sgpr);
address = add64_const64(bld, address, const_offset);
const_offset = 0;
} else {
address = add64_32(bld, address, Operand(offset));
offset = Temp();
}
}
offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero());
} else if (format == Format::FLAT) {
@@ -950,8 +961,10 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, unsigned bytes_need
RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
Temp val = rc == info.dst.regClass() ? info.dst : bld.tmp(rc);
if (use_mubuf) {
assert(bld.program->gfx_level == GFX6 || addr.type() != RegType::vgpr);
aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
mubuf->operands[0] = Operand(get_mubuf_global_rsrc(bld, addr));
if (addr.type() == RegType::vgpr)
mubuf->operands[1] = Operand(addr);
else if (offset.type() == RegType::vgpr)
@@ -2576,11 +2589,11 @@ visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(flat));
} else {
assert(ctx->options->gfx_level == GFX6);
assert(ctx->options->gfx_level == GFX6 || write_address.type() != RegType::vgpr);
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
Temp rsrc = get_gfx6_global_rsrc(bld, write_address);
Temp rsrc = get_mubuf_global_rsrc(bld, write_address);
aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, 0)};
mubuf->operands[0] = Operand(rsrc);
@@ -2713,12 +2726,12 @@ visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(flat));
} else {
assert(ctx->options->gfx_level == GFX6);
assert(ctx->options->gfx_level == GFX6 || addr.type() != RegType::vgpr);
UNUSED aco_opcode image_op;
translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
Temp rsrc = get_gfx6_global_rsrc(bld, addr);
Temp rsrc = get_mubuf_global_rsrc(bld, addr);
aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;