From 0116430d394c2509fedff9f3accce6445349a091 Mon Sep 17 00:00:00 2001 From: Sushma Venkatesh Reddy Date: Tue, 30 Jul 2024 23:04:34 -0700 Subject: [PATCH] intel/brw: Handle 16-bit sampler return payloads The API requires samplers to return 32-bit values even though the hardware can handle 16-bit floating point, so we detect that case and make more efficient use of memory bandwidth. This improves the performance of encoding and decoding tokens in LLM workloads by at least 5% across multiple platforms. Thank you, Kenneth Graunke, for suggesting this and guiding me throughout the implementation. Signed-off-by: Sushma Venkatesh Reddy Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_fs_nir.cpp | 21 +++++++++++++------ .../compiler/brw_lower_logical_sends.cpp | 9 +++++--- src/intel/compiler/brw_nir.c | 6 ++++++ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 93669ba4042..f4f6ce65de5 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -8623,7 +8623,12 @@ fs_nir_emit_texture(nir_to_brw_state &ntb, brw_reg nir_def_reg = get_nir_def(ntb, instr->def); - brw_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse); + bool is_simd8_16bit = nir_alu_type_get_type_size(instr->dest_type) == 16 + && bld.dispatch_width() == 8; + + brw_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), + (is_simd8_16bit ? 8 : 4) + instr->is_sparse); + fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); inst->offset = header_bits; @@ -8635,15 +8640,18 @@ fs_nir_emit_texture(nir_to_brw_state &ntb, if (instr->is_sparse) { read_size = util_last_bit(write_mask) - 1; inst->size_written = - read_size * inst->dst.component_size(inst->exec_size) + + (is_simd8_16bit ? 
2 : 1) * read_size * + inst->dst.component_size(inst->exec_size) + (reg_unit(devinfo) * REG_SIZE); } else { read_size = util_last_bit(write_mask); inst->size_written = - read_size * inst->dst.component_size(inst->exec_size); + (is_simd8_16bit ? 2 : 1) * read_size * + inst->dst.component_size(inst->exec_size); } } else { - inst->size_written = 4 * inst->dst.component_size(inst->exec_size) + + inst->size_written = (is_simd8_16bit ? 2 : 1) * 4 * + inst->dst.component_size(inst->exec_size) + (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0); } @@ -8666,7 +8674,8 @@ fs_nir_emit_texture(nir_to_brw_state &ntb, inst->keep_payload_trailing_zeros = true; } - if (instr->op != nir_texop_query_levels && !instr->is_sparse) { + if (instr->op != nir_texop_query_levels && !instr->is_sparse + && !is_simd8_16bit) { /* In most cases we can write directly to the result. */ inst->dst = nir_def_reg; } else { @@ -8675,7 +8684,7 @@ fs_nir_emit_texture(nir_to_brw_state &ntb, */ brw_reg nir_dest[5]; for (unsigned i = 0; i < read_size; i++) - nir_dest[i] = offset(dst, bld, i); + nir_dest[i] = offset(dst, bld, (is_simd8_16bit ? 2 : 1) * i); if (instr->op == nir_texop_query_levels) { /* # levels is in .w */ diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp index ad3822fb1e6..4f9165eca1a 100644 --- a/src/intel/compiler/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw_lower_logical_sends.cpp @@ -1124,13 +1124,16 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, inst->mlen = mlen; inst->header_size = header_size; inst->sfid = BRW_SFID_SAMPLER; + uint sampler_ret_type = brw_type_size_bits(inst->dst.type) == 16 + ? GFX8_SAMPLER_RETURN_FORMAT_16BITS + : GFX8_SAMPLER_RETURN_FORMAT_32BITS; if (surface.file == IMM && (sampler.file == IMM || sampler_handle.file != BAD_FILE)) { inst->desc = brw_sampler_desc(devinfo, surface.ud, sampler.file == IMM ? 
sampler.ud % 16 : 0, msg_type, simd_mode, - 0 /* return_format unused on gfx7+ */); + sampler_ret_type); inst->src[0] = brw_imm_ud(0); inst->src[1] = brw_imm_ud(0); } else if (surface_handle.file != BAD_FILE) { @@ -1140,7 +1143,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, sampler.file == IMM ? sampler.ud % 16 : 0, msg_type, simd_mode, - 0 /* return_format unused on gfx7+ */); + sampler_ret_type); /* For bindless samplers, the entire address is included in the message * header so we can leave the portion in the message descriptor 0. @@ -1166,7 +1169,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, 0, /* sampler */ msg_type, simd_mode, - 0 /* return_format unused on gfx7+ */); + sampler_ret_type); const fs_builder ubld = bld.group(1, 0).exec_all(); brw_reg desc = ubld.vgrf(BRW_TYPE_UD); if (surface.equals(sampler)) { diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index f53b85bbec2..9d66809ae77 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -996,6 +996,12 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir, OPT(nir_lower_alu_to_scalar, NULL, NULL); + struct nir_opt_16bit_tex_image_options options = { + .rounding_mode = nir_rounding_mode_undef, + .opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint, + }; + OPT(nir_opt_16bit_tex_image, &options); + if (nir->info.stage == MESA_SHADER_GEOMETRY) OPT(nir_lower_gs_intrinsics, 0);