From 7a0838ebb16007e5e7983ddcf9c95e34f03f0fde Mon Sep 17 00:00:00 2001
From: David Heidelberg <david@ixit.cz>
Date: Sun, 1 Sep 2024 17:07:48 +0900
Subject: [PATCH] panfrost/midgard: Implement nir_lower_mem_access_bit_sizes
 pass

Needed for OpenCL support through Rusticl.

Acked-by: Eric R. Smith <eric.smith@collabora.com>
Signed-off-by: David Heidelberg <david@ixit.cz>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30963>
---
 src/panfrost/midgard/midgard_compile.c | 68 ++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c
index be3989c67c1..912f6451fb4 100644
--- a/src/panfrost/midgard/midgard_compile.c
+++ b/src/panfrost/midgard/midgard_compile.c
@@ -323,6 +323,60 @@ midgard_vectorize_filter(const nir_instr *instr, const void *data)
    return 4;
 }
 
+static nir_mem_access_size_align
+mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
+                         uint8_t bit_size, uint32_t align_mul,
+                         uint32_t align_offset, bool offset_is_const,
+                         const void *cb_data)
+{
+   uint32_t align = nir_combined_align(align_mul, align_offset);
+   assert(util_is_power_of_two_nonzero(align));
+
+   /* No more than 16 bytes at a time. */
+   bytes = MIN2(bytes, 16);
+
+   /* Choose the access width: 8-bit if the byte count is odd or we're only
+    * aligned to 1 byte; else 16-bit if the byte count isn't a multiple of 4
+    * or we're only aligned to 2 bytes.
+    *
+    * Otherwise keep the requested bit size, capped at 32 bits; a request
+    * narrower than 32 bits is left as is rather than widened.
+    */
+   if ((bytes & 1) || (align == 1))
+      bit_size = 8;
+   else if ((bytes & 2) || (align == 2))
+      bit_size = 16;
+   else if (bit_size >= 32)
+      bit_size = 32;
+
+   unsigned num_comps = MIN2(bytes / (bit_size / 8), 4);
+
+   /* Push constants require 32-bit loads. */
+   if (intrin == nir_intrinsic_load_push_constant) {
+      if (align_mul >= 4) {
+         /* If align_mul is at least 4 we can use align_offset to find
+          * the exact number of words we need to read.
+          */
+         num_comps = DIV_ROUND_UP((align_offset % 4) + bytes, 4);
+      } else {
+         /* If bytes is aligned on 32-bit, the access might still cross one
+          * word at the beginning, and one word at the end. If bytes is not
+          * aligned on 32-bit, the extra two words should cover for both the
+          * size and offset mis-alignment.
+          */
+         num_comps = (bytes / 4) + 2;
+      }
+      /* NOTE(review): bit_size is already <= 32 here; should this force 32? */
+      bit_size = MIN2(bit_size, 32);
+   }
+
+   return (nir_mem_access_size_align){
+      .num_components = num_comps,
+      .bit_size = bit_size,
+      .align = bit_size / 8,
+   };
+}
+
 void
 midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id)
 {
@@ -359,6 +413,20 @@ midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id)
       NIR_PASS_V(nir, pan_nir_lower_store_component);
    }
 
+   /* Could be eventually useful for Vulkan, but we don't expect it to have
+    * the support, so limit it to compute */
+   if (gl_shader_stage_is_compute(nir->info.stage)) {
+      nir_lower_mem_access_bit_sizes_options mem_size_options = {
+         .modes = nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_ssbo |
+                  nir_var_mem_constant | nir_var_mem_task_payload |
+                  nir_var_shader_temp | nir_var_function_temp |
+                  nir_var_mem_global | nir_var_mem_shared,
+         .callback = mem_access_size_align_cb,
+      };
+
+      NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes, &mem_size_options);
+   }
+
    NIR_PASS_V(nir, nir_lower_ssbo, NULL);
    NIR_PASS_V(nir, pan_nir_lower_zs_store);