From 1f9e44c181d04e40e2247f4a4e52507110d3af2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?=
Date: Sat, 18 Feb 2023 13:42:53 +0100
Subject: [PATCH] aco: Disable MUBUF/MTBUF offsets when they are zero.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fossil DB stats on Rembrandt (GFX10.3):

Totals from 1264 (0.94% of 134920) affected shaders:
VGPRs: 69504 -> 69336 (-0.24%)
CodeSize: 6885468 -> 6886224 (+0.01%); split: -0.02%, +0.03%
MaxWaves: 24632 -> 24670 (+0.15%)
Instrs: 1287027 -> 1287209 (+0.01%); split: -0.04%, +0.05%
Latency: 6830411 -> 6831165 (+0.01%); split: -0.06%, +0.07%
InvThroughput: 1220643 -> 1220438 (-0.02%); split: -0.04%, +0.02%
VClause: 24737 -> 24751 (+0.06%); split: -0.25%, +0.30%
SClause: 42774 -> 42911 (+0.32%); split: -0.13%, +0.45%
Copies: 75408 -> 75600 (+0.25%); split: -0.62%, +0.88%
PreVGPRs: 60544 -> 59809 (-1.21%)

Signed-off-by: Timur Kristóf
Reviewed-by: Rhys Perry
Part-of:
---
 .../compiler/aco_instruction_selection.cpp | 21 +++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index af5d1336649..121ca1841b9 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4088,6 +4088,8 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
          if (offset.isConstant()) {
             offset = Operand::c32(offset.constantValue() + to_add);
+         } else if (offset.isUndefined()) {
+            offset = Operand::c32(to_add);
          } else if (offset_tmp.regClass() == s1) {
             offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
                               Operand::c32(to_add));
@@ -4122,6 +4124,8 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
          if (offset.isConstant()) {
             aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
+         } else if (offset.isUndefined()) {
+            aligned_offset = Operand::zero();
          } else if (offset_tmp.regClass() == s1) {
             aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
                                       Operand::c32(0xfffffffcu), offset_tmp);
@@ -4139,7 +4143,8 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
          }
       }
       Temp aligned_offset_tmp =
-         aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset);
+         aligned_offset.isTemp() ? aligned_offset.getTemp() :
+         aligned_offset.isConstant() ? bld.copy(bld.def(s1), aligned_offset) : Temp(0, s1);
 
       Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
                                  reduced_const_offset, byte_align ? Temp() : info.dst);
@@ -4157,6 +4162,8 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
          if (byte_align == -1) {
             if (offset.isConstant())
                byte_align_off = Operand::c32(offset.constantValue() % 4u);
+            else if (offset.isUndefined())
+               byte_align_off = Operand::zero();
             else if (offset.size() == 2)
                byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
                                                             RegClass(offset.getTemp().type(), 1)));
@@ -7032,11 +7039,13 @@ visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
    Builder bld(ctx->program, ctx->block);
 
    bool idxen = !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
+   bool v_offset_zero = nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]);
    bool s_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
 
    Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
-   Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
+   Temp v_offset =
+      v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
    Temp s_offset =
       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
@@ -7100,11 +7109,15 @@ visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
    Builder bld(ctx->program, ctx->block);
 
    bool idxen = !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
+   bool v_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
+   bool s_offset_zero = nir_src_is_const(intrin->src[3]) && !nir_src_as_uint(intrin->src[3]);
 
    Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
-   Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa));
-   Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
+   Temp v_offset =
+      v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa));
+   Temp s_offset =
+      s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
 
    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;