From c70dcd1451b62a9c6cffa35ab04b9eae75fcd655 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Wed, 2 Apr 2025 14:43:05 +0200 Subject: [PATCH] aco/gfx9+: use d16 global/scratch/buffer loads Full register loads are not necessary and prevent packing optimizations. Global/Scratch is GFX9+ so D16 loads are always supported. We already used LDS D16 loads. Foz-DB Navi31(mostly RA noise): Totals from 716 (0.90% of 79789) affected shaders: Instrs: 3854176 -> 3854238 (+0.00%); split: -0.00%, +0.00% CodeSize: 20034440 -> 20035220 (+0.00%); split: -0.00%, +0.00% Latency: 24410951 -> 24411120 (+0.00%) InvThroughput: 5181276 -> 5181301 (+0.00%) Copies: 320258 -> 320317 (+0.02%) VALU: 2207307 -> 2207366 (+0.00%) Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index bea6416cadd..873d7886fc8 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -4587,10 +4587,12 @@ mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigne aco_opcode op; if (bytes_needed == 1 || align_ % 2) { bytes_size = 1; - op = aco_opcode::buffer_load_ubyte; + op = bld.program->gfx_level >= GFX9 ? aco_opcode::buffer_load_ubyte_d16 + : aco_opcode::buffer_load_ubyte; } else if (bytes_needed == 2 || align_ % 4) { bytes_size = 2; - op = aco_opcode::buffer_load_ushort; + op = bld.program->gfx_level >= GFX9 ? 
aco_opcode::buffer_load_short_d16 + : aco_opcode::buffer_load_ushort; } else if (bytes_needed <= 4) { bytes_size = 4; op = aco_opcode::buffer_load_dword; @@ -4695,10 +4697,10 @@ scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsig aco_opcode op; if (bytes_needed == 1 || align_ % 2u) { bytes_size = 1; - op = aco_opcode::scratch_load_ubyte; + op = aco_opcode::scratch_load_ubyte_d16; } else if (bytes_needed == 2 || align_ % 4u) { bytes_size = 2; - op = aco_opcode::scratch_load_ushort; + op = aco_opcode::scratch_load_short_d16; } else if (bytes_needed <= 4) { bytes_size = 4; op = aco_opcode::scratch_load_dword; @@ -4849,12 +4851,12 @@ global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsign if (bytes_needed == 1 || align_ % 2u) { bytes_size = 1; op = use_mubuf ? aco_opcode::buffer_load_ubyte - : global ? aco_opcode::global_load_ubyte + : global ? aco_opcode::global_load_ubyte_d16 : aco_opcode::flat_load_ubyte; } else if (bytes_needed == 2 || align_ % 4u) { bytes_size = 2; op = use_mubuf ? aco_opcode::buffer_load_ushort - : global ? aco_opcode::global_load_ushort + : global ? aco_opcode::global_load_short_d16 : aco_opcode::flat_load_ushort; } else if (bytes_needed <= 4) { bytes_size = 4;