From 0116430d394c2509fedff9f3accce6445349a091 Mon Sep 17 00:00:00 2001
From: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com>
Date: Tue, 30 Jul 2024 23:04:34 -0700
Subject: [PATCH] intel/brw: Handle 16-bit sampler return payloads

The API requires samplers to return 32-bit values even though the
hardware can handle 16-bit floating point, so we detect that case and
make more efficient use of memory bandwidth. This improves the
performance of encoding and decoding tokens in LLM workloads by at
least 5% across multiple platforms.

Thank you to Kenneth Graunke for suggesting this approach and guiding
me throughout the implementation.

Signed-off-by: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30447>
---
 src/intel/compiler/brw_fs_nir.cpp             | 21 +++++++++++++------
 .../compiler/brw_lower_logical_sends.cpp      |  9 +++++---
 src/intel/compiler/brw_nir.c                  |  6 ++++++
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 93669ba4042..f4f6ce65de5 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -8623,7 +8623,12 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
 
    brw_reg nir_def_reg = get_nir_def(ntb, instr->def);
 
-   brw_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
+   bool is_simd8_16bit = nir_alu_type_get_type_size(instr->dest_type) == 16
+      && bld.dispatch_width() == 8;
+
+   brw_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type),
+      (is_simd8_16bit ? 8 : 4) + instr->is_sparse);
+
    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
    inst->offset = header_bits;
 
@@ -8635,15 +8640,18 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
       if (instr->is_sparse) {
          read_size = util_last_bit(write_mask) - 1;
          inst->size_written =
-            read_size * inst->dst.component_size(inst->exec_size) +
+            (is_simd8_16bit ? 2 : 1) * read_size *
+            inst->dst.component_size(inst->exec_size) +
             (reg_unit(devinfo) * REG_SIZE);
       } else {
          read_size = util_last_bit(write_mask);
          inst->size_written =
-            read_size * inst->dst.component_size(inst->exec_size);
+            (is_simd8_16bit ? 2 : 1) * read_size *
+            inst->dst.component_size(inst->exec_size);
       }
    } else {
-      inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
+      inst->size_written = (is_simd8_16bit ? 2 : 1) * 4 *
+                           inst->dst.component_size(inst->exec_size) +
                            (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
    }
 
@@ -8666,7 +8674,8 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
       inst->keep_payload_trailing_zeros = true;
    }
 
-   if (instr->op != nir_texop_query_levels && !instr->is_sparse) {
+   if (instr->op != nir_texop_query_levels && !instr->is_sparse
+      && !is_simd8_16bit) {
       /* In most cases we can write directly to the result. */
       inst->dst = nir_def_reg;
    } else {
@@ -8675,7 +8684,7 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
        */
       brw_reg nir_dest[5];
       for (unsigned i = 0; i < read_size; i++)
-         nir_dest[i] = offset(dst, bld, i);
+         nir_dest[i] = offset(dst, bld, (is_simd8_16bit ? 2 : 1) * i);
 
       if (instr->op == nir_texop_query_levels) {
          /* # levels is in .w */
diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp
index ad3822fb1e6..4f9165eca1a 100644
--- a/src/intel/compiler/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw_lower_logical_sends.cpp
@@ -1124,13 +1124,16 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
    inst->mlen = mlen;
    inst->header_size = header_size;
    inst->sfid = BRW_SFID_SAMPLER;
+   uint sampler_ret_type = brw_type_size_bits(inst->dst.type) == 16
+      ? GFX8_SAMPLER_RETURN_FORMAT_16BITS
+      : GFX8_SAMPLER_RETURN_FORMAT_32BITS;
    if (surface.file == IMM &&
        (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
       inst->desc = brw_sampler_desc(devinfo, surface.ud,
                                     sampler.file == IMM ? sampler.ud % 16 : 0,
                                     msg_type,
                                     simd_mode,
-                                    0 /* return_format unused on gfx7+ */);
+                                    sampler_ret_type);
       inst->src[0] = brw_imm_ud(0);
       inst->src[1] = brw_imm_ud(0);
    } else if (surface_handle.file != BAD_FILE) {
@@ -1140,7 +1143,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
                                     sampler.file == IMM ? sampler.ud % 16 : 0,
                                     msg_type,
                                     simd_mode,
-                                    0 /* return_format unused on gfx7+ */);
+                                    sampler_ret_type);
 
       /* For bindless samplers, the entire address is included in the message
        * header so we can leave the portion in the message descriptor 0.
@@ -1166,7 +1169,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
                                     0, /* sampler */
                                     msg_type,
                                     simd_mode,
-                                    0 /* return_format unused on gfx7+ */);
+                                    sampler_ret_type);
       const fs_builder ubld = bld.group(1, 0).exec_all();
       brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
       if (surface.equals(sampler)) {
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index f53b85bbec2..9d66809ae77 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -996,6 +996,12 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
 
    OPT(nir_lower_alu_to_scalar, NULL, NULL);
 
+   struct nir_opt_16bit_tex_image_options options = {
+      .rounding_mode = nir_rounding_mode_undef,
+      .opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
+   };
+   OPT(nir_opt_16bit_tex_image, &options);
+
    if (nir->info.stage == MESA_SHADER_GEOMETRY)
       OPT(nir_lower_gs_intrinsics, 0);