intel: Move slm functions from brw_compiler.h to intel_compute_slm.c/h

These functions were inlined in a header and duplicated between brw and elk. That alone would be reason enough to move them to a C file, but upcoming patches will add more code to support Xe2 platforms, which would cause even more code to be inlined and duplicated, increasing the library size. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Signed-off-by: José Roberto de Souza <jose.souza@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28910>
2024-04-19 11:29:22 -07:00
parent 357dde47a5
commit f5f71bae02
15 changed files with 100 additions and 118 deletions
@@ -82,6 +82,7 @@
 #endif

 #include "drm-uapi/i915_drm.h"
+#include "intel/common/intel_compute_slm.h"
 #include "intel/common/intel_l3_config.h"
 #include "intel/common/intel_sample_positions.h"
 #include "intel/compiler/elk/elk_compiler.h"
@@ -8165,8 +8166,8 @@ crocus_upload_compute_state(struct crocus_context *ice,
         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
         idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
         idd.BarrierEnable = cs_prog_data->uses_barrier;
-         idd.SharedLocalMemorySize = elk_encode_slm_size(GFX_VER,
-                                                         prog_data->total_shared);
+         idd.SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER,
+                                                                   prog_data->total_shared);
 #if GFX_VERx10 >= 75
         idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
 #endif
@@ -96,6 +96,7 @@
 #include "util/u_trace_gallium.h"
 #include "nir.h"
 #include "intel/common/intel_aux_map.h"
+#include "intel/common/intel_compute_slm.h"
 #include "intel/common/intel_l3_config.h"
 #include "intel/common/intel_sample_positions.h"
 #include "intel/ds/intel_tracepoints.h"
@@ -311,17 +312,6 @@ translate_wrap(unsigned pipe_wrap)
   return map[pipe_wrap];
 }

-
-static inline uint32_t
-iris_encode_slm_size(unsigned gen, uint32_t bytes)
-{
-#if GFX_VER >= 9
-   return encode_slm_size(gen, bytes);
-#else
-   return elk_encode_slm_size(gen, bytes);
-#endif
-}
-
 /**
 * Allocate space for some indirect state.
 *
@@ -8871,7 +8861,7 @@ iris_upload_compute_walker(struct iris_context *ice,
   idd.KernelStartPointer = KSP(shader);
   idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
   idd.SharedLocalMemorySize =
-      iris_encode_slm_size(GFX_VER, shader->total_shared);
+      intel_compute_slm_encode_size(GFX_VER, shader->total_shared);
   idd.SamplerStatePointer = shs->sampler_table.offset;
   idd.SamplerCount = encode_sampler_count(shader),
   idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
@@ -9029,7 +9019,7 @@ iris_upload_gpgpu_walker(struct iris_context *ice,

      iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
         idd.SharedLocalMemorySize =
-            iris_encode_slm_size(GFX_VER, ish->kernel_shared_size + grid->variable_shared_mem);
+            intel_compute_slm_encode_size(GFX_VER, ish->kernel_shared_size + grid->variable_shared_mem);
         idd.KernelStartPointer =
            KSP(shader) + iris_cs_data_prog_offset(cs_data, dispatch.simd_size);
         idd.SamplerStatePointer = shs->sampler_table.offset;
@@ -26,6 +26,7 @@

 #include "blorp_priv.h"
 #include "dev/intel_device_info.h"
+#include "common/intel_compute_slm.h"
 #include "common/intel_sample_positions.h"
 #include "common/intel_l3_config.h"
 #include "genxml/gen_macros.h"
@@ -1735,7 +1736,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
         .BindingTablePointer = surfaces_offset,
         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
         .SharedLocalMemorySize =
-            encode_slm_size(GFX_VER, prog_data->total_shared),
+            intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared),
         .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
         .NumberOfBarriers = cs_prog_data->uses_barrier,
      };
@@ -1798,8 +1799,8 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
      .BindingTablePointer = surfaces_offset,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
-      .SharedLocalMemorySize = encode_slm_size(GFX_VER,
-                                               prog_data->total_shared),
+      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER,
+                                                             prog_data->total_shared),
      .BarrierEnable = cs_prog_data->uses_barrier,
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
@@ -30,6 +30,7 @@

 #include "blorp_priv.h"
 #include "dev/intel_device_info.h"
+#include "common/intel_compute_slm.h"
 #include "common/intel_sample_positions.h"
 #include "common/intel_l3_config.h"
 #include "genxml/gen_macros.h"
@@ -2060,8 +2061,8 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
      .BindingTablePointer = surfaces_offset,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
-      .SharedLocalMemorySize = elk_encode_slm_size(GFX_VER,
-                                                   prog_data->total_shared),
+      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER,
+                                                             prog_data->total_shared),
      .BarrierEnable = cs_prog_data->uses_barrier,
 #if GFX_VER >= 8 || GFX_VERx10 == 75
      .CrossThreadConstantDataReadLength =
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "intel_compute_slm.h"
+
+#include <assert.h>
+
+#include "util/macros.h"
+#include "util/u_math.h"
+
+/* Round a nonzero Shared Local Memory size in bytes (asserted <= 64 kB) up
+ * to a power of two, at least 1 kB on gen >= 9 and 4 kB before; 0 stays 0.
+ */
+uint32_t
+intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes)
+{
+   assert(bytes <= 64 * 1024);
+   if (bytes > 0)
+      return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 1024 : 4096);
+   else
+      return 0;
+}
+
+uint32_t
+intel_compute_slm_encode_size(unsigned gen, uint32_t bytes)
+{
+   uint32_t slm_size = 0;
+
+   /* Shared Local Memory is specified as powers of two, and encoded in
+    * INTERFACE_DESCRIPTOR_DATA with the following representations:
+    *
+    * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
+    * -------------------------------------------------------------------
+    * Gfx7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
+    * -------------------------------------------------------------------
+    * Gfx9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
+    */
+
+   if (bytes > 0) {
+      slm_size = intel_compute_slm_calculate_size(gen, bytes);
+      assert(util_is_power_of_two_nonzero(slm_size));
+
+      if (gen >= 9) {
+         /* ffs() is 1-based: 1024 B (exponent 10, i.e. 1 kB) encodes as 1. */
+         assert(slm_size >= 1024);
+         slm_size = ffs(slm_size) - 10;
+      } else {
+         assert(slm_size >= 4096);
+         /* Pre-Gfx9 representation: the size expressed in 4 kB units. */
+         slm_size = slm_size / 4096;
+      }
+   }
+
+   return slm_size;
+}
@@ -0,0 +1,11 @@
+/*
+ * Copyright 2024 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+uint32_t intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes);
+uint32_t intel_compute_slm_encode_size(unsigned gen, uint32_t bytes);
@@ -36,6 +36,8 @@ files_libintel_common = files(
  'intel_bind_timeline.c',
  'intel_bind_timeline.h',
  'intel_buffer_alloc.h',
+  'intel_compute_slm.c',
+  'intel_compute_slm.h',
  'intel_debug_identifier.h',
  'intel_debug_identifier.c',
  'intel_engine.c',
@@ -1556,52 +1556,6 @@ void brw_debug_key_recompile(const struct brw_compiler *c, void *log,
                             const struct brw_base_prog_key *old_key,
                             const struct brw_base_prog_key *key);

-/* Shared Local Memory Size is specified as powers of two,
- * and also have a Gen-dependent minimum value if not zero.
- */
-static inline uint32_t
-intel_calculate_slm_size(unsigned gen, uint32_t bytes)
-{
-   assert(bytes <= 64 * 1024);
-   if (bytes > 0)
-      return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 1024 : 4096);
-   else
-      return 0;
-}
-
-static inline uint32_t
-encode_slm_size(unsigned gen, uint32_t bytes)
-{
-   uint32_t slm_size = 0;
-
-   /* Shared Local Memory is specified as powers of two, and encoded in
-    * INTERFACE_DESCRIPTOR_DATA with the following representations:
-    *
-    * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
-    * -------------------------------------------------------------------
-    * Gfx7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
-    * -------------------------------------------------------------------
-    * Gfx9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
-    */
-
-   if (bytes > 0) {
-      slm_size = intel_calculate_slm_size(gen, bytes);
-      assert(util_is_power_of_two_nonzero(slm_size));
-
-      if (gen >= 9) {
-         /* Turn an exponent of 10 (1024 kB) into 1. */
-         assert(slm_size >= 1024);
-         slm_size = ffs(slm_size) - 10;
-      } else {
-         assert(slm_size >= 4096);
-         /* Convert to the pre-Gfx9 representation. */
-         slm_size = slm_size / 4096;
-      }
-   }
-
-   return slm_size;
-}
-
 unsigned
 brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
                             unsigned threads);
@@ -1652,46 +1652,6 @@ void elk_debug_key_recompile(const struct elk_compiler *c, void *log,
                             const struct elk_base_prog_key *old_key,
                             const struct elk_base_prog_key *key);

-/* Shared Local Memory Size is specified as powers of two,
- * and also have a Gen-dependent minimum value if not zero.
- */
-static inline uint32_t
-elk_calculate_slm_size(unsigned gen, uint32_t bytes)
-{
-   assert(bytes <= 64 * 1024);
-   if (bytes > 0)
-      return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 1024 : 4096);
-   else
-      return 0;
-}
-
-static inline uint32_t
-elk_encode_slm_size(unsigned gen, uint32_t bytes)
-{
-   uint32_t slm_size = 0;
-
-   /* Shared Local Memory is specified as powers of two, and encoded in
-    * INTERFACE_DESCRIPTOR_DATA with the following representations:
-    *
-    * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
-    * -------------------------------------------------------------------
-    * Gfx7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
-    * -------------------------------------------------------------------
-    * Gfx9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
-    */
-
-   if (bytes > 0) {
-      slm_size = elk_calculate_slm_size(gen, bytes);
-      assert(util_is_power_of_two_nonzero(slm_size));
-
-      assert(slm_size >= 4096);
-      /* Convert to the pre-Gfx9 representation. */
-      slm_size = slm_size / 4096;
-   }
-
-   return slm_size;
-}
-
 unsigned
 elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data,
                             unsigned threads);
@@ -29,6 +29,7 @@

 #include "util/mesa-sha1.h"
 #include "util/os_time.h"
+#include "common/intel_compute_slm.h"
 #include "common/intel_l3_config.h"
 #include "common/intel_sample_positions.h"
 #include "compiler/brw_disasm.h"
@@ -1162,7 +1163,7 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
         const unsigned chunk_size = 16;
         const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
         assert(shared_size <=
-                intel_calculate_slm_size(compiler->devinfo->ver, nir->info.shared_size));
+                intel_compute_slm_calculate_size(compiler->devinfo->ver, nir->info.shared_size));

         NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
                  shared_size, chunk_size);
@@ -30,6 +30,7 @@
 #include "vk_util.h"

 #include "common/intel_aux_map.h"
+#include "common/intel_compute_slm.h"
 #include "genxml/gen_macros.h"
 #include "genxml/genX_pack.h"
 #include "genxml/genX_rt_pack.h"
@@ -283,7 +284,7 @@ get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
      .BindingTableEntryCount = devinfo->verx10 == 125 ?
         0 : 1 + MIN2(shader->bind_map.surface_count, 30),
      .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
-      .SharedLocalMemorySize = encode_slm_size(GFX_VER, prog_data->base.total_shared),
+      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
      .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
      .NumberOfBarriers = prog_data->uses_barrier,
   };
@@ -27,6 +27,7 @@
 #include "genxml/genX_pack.h"
 #include "genxml/genX_rt_pack.h"

+#include "common/intel_compute_slm.h"
 #include "common/intel_genX_state_brw.h"
 #include "common/intel_l3_config.h"
 #include "common/intel_sample_positions.h"
@@ -1792,7 +1793,7 @@ emit_task_state(struct anv_graphics_pipeline *pipeline)

      task.NumberofBarriers                  = task_prog_data->base.uses_barrier;
      task.SharedLocalMemorySize             =
-         encode_slm_size(GFX_VER, task_prog_data->base.base.total_shared);
+         intel_compute_slm_encode_size(GFX_VER, task_prog_data->base.base.total_shared);
      task.PreferredSLMAllocationSize        =
         preferred_slm_allocation_size(devinfo);

@@ -1873,7 +1874,7 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline)

      mesh.NumberofBarriers                  = mesh_prog_data->base.uses_barrier;
      mesh.SharedLocalMemorySize             =
-         encode_slm_size(GFX_VER, mesh_prog_data->base.base.total_shared);
+         intel_compute_slm_encode_size(GFX_VER, mesh_prog_data->base.base.total_shared);
      mesh.PreferredSLMAllocationSize        =
         preferred_slm_allocation_size(devinfo);

@@ -2080,7 +2081,7 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
         0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
      .BarrierEnable          = cs_prog_data->uses_barrier,
      .SharedLocalMemorySize  =
-         encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),
+         intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared),

      .ConstantURBEntryReadOffset = 0,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
@@ -30,6 +30,7 @@

 #include "genxml/gen_macros.h"
 #include "genxml/genX_pack.h"
+#include "common/intel_compute_slm.h"
 #include "common/intel_genX_state_brw.h"

 static void
@@ -580,8 +581,8 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
            .BindingTablePointer               = 0,
            .BindingTableEntryCount            = 0,
            .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
-            .SharedLocalMemorySize             = encode_slm_size(GFX_VER,
-                                                                 prog_data->base.total_shared),
+            .SharedLocalMemorySize             = intel_compute_slm_encode_size(GFX_VER,
+                                                                               prog_data->base.total_shared),
            .NumberOfBarriers                  = prog_data->uses_barrier,
         };
      }
@@ -649,8 +650,8 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
         .SamplerCount                          = 0,
         .BindingTableEntryCount                = 0,
         .BarrierEnable                         = prog_data->uses_barrier,
-         .SharedLocalMemorySize                 = encode_slm_size(GFX_VER,
-                                                                  prog_data->base.total_shared),
+         .SharedLocalMemorySize                 = intel_compute_slm_encode_size(GFX_VER,
+                                                                                prog_data->base.total_shared),

         .ConstantURBEntryReadOffset            = 0,
         .ConstantURBEntryReadLength            = prog_data->push.per_thread.regs,
@@ -29,6 +29,7 @@

 #include "util/mesa-sha1.h"
 #include "util/os_time.h"
+#include "common/intel_compute_slm.h"
 #include "common/intel_l3_config.h"
 #include "common/intel_sample_positions.h"
 #include "compiler/elk/elk_disasm.h"
@@ -568,7 +569,7 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
         const unsigned chunk_size = 16;
         const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
         assert(shared_size <=
-                elk_calculate_slm_size(compiler->devinfo->ver, nir->info.shared_size));
+                intel_compute_slm_calculate_size(compiler->devinfo->ver, nir->info.shared_size));

         NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
                  shared_size, chunk_size);
@@ -27,6 +27,7 @@
 #include "genxml/genX_pack.h"
 #include "genxml/genX_rt_pack.h"

+#include "common/intel_compute_slm.h"
 #include "common/intel_genX_state_elk.h"
 #include "common/intel_l3_config.h"
 #include "common/intel_sample_positions.h"
@@ -1939,8 +1940,7 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
       */
      .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),
      .BarrierEnable          = cs_prog_data->uses_barrier,
-      .SharedLocalMemorySize  =
-         elk_encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),
+      .SharedLocalMemorySize  = intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared),

 #if GFX_VERx10 != 75
      .ConstantURBEntryReadOffset = 0,