diff --git a/src/gallium/drivers/crocus/crocus_state.c b/src/gallium/drivers/crocus/crocus_state.c index e9b7a24f918..375c3559cdc 100644 --- a/src/gallium/drivers/crocus/crocus_state.c +++ b/src/gallium/drivers/crocus/crocus_state.c @@ -82,6 +82,7 @@ #endif #include "drm-uapi/i915_drm.h" +#include "intel/common/intel_compute_slm.h" #include "intel/common/intel_l3_config.h" #include "intel/common/intel_sample_positions.h" #include "intel/compiler/elk/elk_compiler.h" @@ -8165,8 +8166,8 @@ crocus_upload_compute_state(struct crocus_context *ice, idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads; idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs; idd.BarrierEnable = cs_prog_data->uses_barrier; - idd.SharedLocalMemorySize = elk_encode_slm_size(GFX_VER, - prog_data->total_shared); + idd.SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, + prog_data->total_shared); #if GFX_VERx10 >= 75 idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs; #endif diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 4cc840a1cb3..b1e93ad7731 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -96,6 +96,7 @@ #include "util/u_trace_gallium.h" #include "nir.h" #include "intel/common/intel_aux_map.h" +#include "intel/common/intel_compute_slm.h" #include "intel/common/intel_l3_config.h" #include "intel/common/intel_sample_positions.h" #include "intel/ds/intel_tracepoints.h" @@ -311,17 +312,6 @@ translate_wrap(unsigned pipe_wrap) return map[pipe_wrap]; } - -static inline uint32_t -iris_encode_slm_size(unsigned gen, uint32_t bytes) -{ -#if GFX_VER >= 9 - return encode_slm_size(gen, bytes); -#else - return elk_encode_slm_size(gen, bytes); -#endif -} - /** * Allocate space for some indirect state. 
* @@ -8871,7 +8861,7 @@ iris_upload_compute_walker(struct iris_context *ice, idd.KernelStartPointer = KSP(shader); idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads; idd.SharedLocalMemorySize = - iris_encode_slm_size(GFX_VER, shader->total_shared); + intel_compute_slm_encode_size(GFX_VER, shader->total_shared); idd.SamplerStatePointer = shs->sampler_table.offset; idd.SamplerCount = encode_sampler_count(shader), idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE]; @@ -9029,7 +9019,7 @@ iris_upload_gpgpu_walker(struct iris_context *ice, iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) { idd.SharedLocalMemorySize = - iris_encode_slm_size(GFX_VER, ish->kernel_shared_size + grid->variable_shared_mem); + intel_compute_slm_encode_size(GFX_VER, ish->kernel_shared_size + grid->variable_shared_mem); idd.KernelStartPointer = KSP(shader) + iris_cs_data_prog_offset(cs_data, dispatch.simd_size); idd.SamplerStatePointer = shs->sampler_table.offset; diff --git a/src/intel/blorp/blorp_genX_exec_brw.h b/src/intel/blorp/blorp_genX_exec_brw.h index 2b68ea974d6..97e24c89641 100644 --- a/src/intel/blorp/blorp_genX_exec_brw.h +++ b/src/intel/blorp/blorp_genX_exec_brw.h @@ -26,6 +26,7 @@ #include "blorp_priv.h" #include "dev/intel_device_info.h" +#include "common/intel_compute_slm.h" #include "common/intel_sample_positions.h" #include "common/intel_l3_config.h" #include "genxml/gen_macros.h" @@ -1735,7 +1736,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) .BindingTablePointer = surfaces_offset, .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, .SharedLocalMemorySize = - encode_slm_size(GFX_VER, prog_data->total_shared), + intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared), .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo), .NumberOfBarriers = cs_prog_data->uses_barrier, }; @@ -1798,8 +1799,8 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) 
.BindingTablePointer = surfaces_offset, .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, - .SharedLocalMemorySize = encode_slm_size(GFX_VER, - prog_data->total_shared), + .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, + prog_data->total_shared), .BarrierEnable = cs_prog_data->uses_barrier, .CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs, diff --git a/src/intel/blorp/blorp_genX_exec_elk.h b/src/intel/blorp/blorp_genX_exec_elk.h index 1b3b12cb74d..6c60986bcd6 100644 --- a/src/intel/blorp/blorp_genX_exec_elk.h +++ b/src/intel/blorp/blorp_genX_exec_elk.h @@ -30,6 +30,7 @@ #include "blorp_priv.h" #include "dev/intel_device_info.h" +#include "common/intel_compute_slm.h" #include "common/intel_sample_positions.h" #include "common/intel_l3_config.h" #include "genxml/gen_macros.h" @@ -2060,8 +2061,8 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) .BindingTablePointer = surfaces_offset, .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, - .SharedLocalMemorySize = elk_encode_slm_size(GFX_VER, - prog_data->total_shared), + .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, + prog_data->total_shared), .BarrierEnable = cs_prog_data->uses_barrier, #if GFX_VER >= 8 || GFX_VERx10 == 75 .CrossThreadConstantDataReadLength = diff --git a/src/intel/common/intel_compute_slm.c b/src/intel/common/intel_compute_slm.c new file mode 100644 index 00000000000..e589fd744a3 --- /dev/null +++ b/src/intel/common/intel_compute_slm.c @@ -0,0 +1,57 @@ +/* + * Copyright 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "intel_compute_slm.h" + +#include <assert.h> + +#include "util/macros.h" +#include "util/u_math.h" + +/* Shared Local Memory Size is specified as powers of two, + * and also has a Gen-dependent minimum value if not zero. 
+ */ +uint32_t +intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes) +{ + assert(bytes <= 64 * 1024); + if (bytes > 0) + return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 1024 : 4096); + else + return 0; +} + +uint32_t +intel_compute_slm_encode_size(unsigned gen, uint32_t bytes) +{ + uint32_t slm_size = 0; + + /* Shared Local Memory is specified as powers of two, and encoded in + * INTERFACE_DESCRIPTOR_DATA with the following representations: + * + * Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB | + * ------------------------------------------------------------------- + * Gfx7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 | + * ------------------------------------------------------------------- + * Gfx9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + */ + + if (bytes > 0) { + slm_size = intel_compute_slm_calculate_size(gen, bytes); + assert(util_is_power_of_two_nonzero(slm_size)); + + if (gen >= 9) { + /* Turn an exponent of 10 (1024 B, i.e. 1 kB) into 1. */ + assert(slm_size >= 1024); + slm_size = ffs(slm_size) - 10; + } else { + assert(slm_size >= 4096); + /* Convert to the pre-Gfx9 representation. 
*/ + slm_size = slm_size / 4096; + } + } + + return slm_size; +} diff --git a/src/intel/common/intel_compute_slm.h b/src/intel/common/intel_compute_slm.h new file mode 100644 index 00000000000..56866110cf1 --- /dev/null +++ b/src/intel/common/intel_compute_slm.h @@ -0,0 +1,11 @@ +/* + * Copyright 2024 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include <stdint.h> + +uint32_t intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes); +uint32_t intel_compute_slm_encode_size(unsigned gen, uint32_t bytes); diff --git a/src/intel/common/meson.build b/src/intel/common/meson.build index 88985063589..d306187727b 100644 --- a/src/intel/common/meson.build +++ b/src/intel/common/meson.build @@ -36,6 +36,8 @@ files_libintel_common = files( 'intel_bind_timeline.c', 'intel_bind_timeline.h', 'intel_buffer_alloc.h', + 'intel_compute_slm.c', + 'intel_compute_slm.h', 'intel_debug_identifier.h', 'intel_debug_identifier.c', 'intel_engine.c', diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index a7d8cb2dcfc..8150fd6f2e1 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -1556,52 +1556,6 @@ void brw_debug_key_recompile(const struct brw_compiler *c, void *log, const struct brw_base_prog_key *old_key, const struct brw_base_prog_key *key); -/* Shared Local Memory Size is specified as powers of two, - * and also have a Gen-dependent minimum value if not zero. - */ -static inline uint32_t -intel_calculate_slm_size(unsigned gen, uint32_t bytes) -{ - assert(bytes <= 64 * 1024); - if (bytes > 0) - return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 
1024 : 4096); - else - return 0; -} - -static inline uint32_t -encode_slm_size(unsigned gen, uint32_t bytes) -{ - uint32_t slm_size = 0; - - /* Shared Local Memory is specified as powers of two, and encoded in - * INTERFACE_DESCRIPTOR_DATA with the following representations: - * - * Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB | - * ------------------------------------------------------------------- - * Gfx7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 | - * ------------------------------------------------------------------- - * Gfx9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - */ - - if (bytes > 0) { - slm_size = intel_calculate_slm_size(gen, bytes); - assert(util_is_power_of_two_nonzero(slm_size)); - - if (gen >= 9) { - /* Turn an exponent of 10 (1024 kB) into 1. */ - assert(slm_size >= 1024); - slm_size = ffs(slm_size) - 10; - } else { - assert(slm_size >= 4096); - /* Convert to the pre-Gfx9 representation. */ - slm_size = slm_size / 4096; - } - } - - return slm_size; -} - unsigned brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, unsigned threads); diff --git a/src/intel/compiler/elk/elk_compiler.h b/src/intel/compiler/elk/elk_compiler.h index 978264a4f3f..1ed7d7d143e 100644 --- a/src/intel/compiler/elk/elk_compiler.h +++ b/src/intel/compiler/elk/elk_compiler.h @@ -1652,46 +1652,6 @@ void elk_debug_key_recompile(const struct elk_compiler *c, void *log, const struct elk_base_prog_key *old_key, const struct elk_base_prog_key *key); -/* Shared Local Memory Size is specified as powers of two, - * and also have a Gen-dependent minimum value if not zero. - */ -static inline uint32_t -elk_calculate_slm_size(unsigned gen, uint32_t bytes) -{ - assert(bytes <= 64 * 1024); - if (bytes > 0) - return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 
1024 : 4096); - else - return 0; -} - -static inline uint32_t -elk_encode_slm_size(unsigned gen, uint32_t bytes) -{ - uint32_t slm_size = 0; - - /* Shared Local Memory is specified as powers of two, and encoded in - * INTERFACE_DESCRIPTOR_DATA with the following representations: - * - * Size | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB | - * ------------------------------------------------------------------- - * Gfx7-8 | 0 | none | none | 1 | 2 | 4 | 8 | 16 | - * ------------------------------------------------------------------- - * Gfx9+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - */ - - if (bytes > 0) { - slm_size = elk_calculate_slm_size(gen, bytes); - assert(util_is_power_of_two_nonzero(slm_size)); - - assert(slm_size >= 4096); - /* Convert to the pre-Gfx9 representation. */ - slm_size = slm_size / 4096; - } - - return slm_size; -} - unsigned elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data, unsigned threads); diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 14b6a6ff9e4..dc049697234 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -29,6 +29,7 @@ #include "util/mesa-sha1.h" #include "util/os_time.h" +#include "common/intel_compute_slm.h" #include "common/intel_l3_config.h" #include "common/intel_sample_positions.h" #include "compiler/brw_disasm.h" @@ -1162,7 +1163,7 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline, const unsigned chunk_size = 16; const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size); assert(shared_size <= - intel_calculate_slm_size(compiler->devinfo->ver, nir->info.shared_size)); + intel_compute_slm_calculate_size(compiler->devinfo->ver, nir->info.shared_size)); NIR_PASS(_, nir, nir_zero_initialize_shared_memory, shared_size, chunk_size); diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c index 4d9556a885f..8cbe4594362 100644 --- a/src/intel/vulkan/genX_cmd_compute.c +++ 
b/src/intel/vulkan/genX_cmd_compute.c @@ -30,6 +30,7 @@ #include "vk_util.h" #include "common/intel_aux_map.h" +#include "common/intel_compute_slm.h" #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" #include "genxml/genX_rt_pack.h" @@ -283,7 +284,7 @@ get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer, .BindingTableEntryCount = devinfo->verx10 == 125 ? 0 : 1 + MIN2(shader->bind_map.surface_count, 30), .NumberofThreadsinGPGPUThreadGroup = dispatch->threads, - .SharedLocalMemorySize = encode_slm_size(GFX_VER, prog_data->base.total_shared), + .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared), .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo), .NumberOfBarriers = prog_data->uses_barrier, }; diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index f667c8bacbd..d2609fc26a9 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -27,6 +27,7 @@ #include "genxml/genX_pack.h" #include "genxml/genX_rt_pack.h" +#include "common/intel_compute_slm.h" #include "common/intel_genX_state_brw.h" #include "common/intel_l3_config.h" #include "common/intel_sample_positions.h" @@ -1792,7 +1793,7 @@ emit_task_state(struct anv_graphics_pipeline *pipeline) task.NumberofBarriers = task_prog_data->base.uses_barrier; task.SharedLocalMemorySize = - encode_slm_size(GFX_VER, task_prog_data->base.base.total_shared); + intel_compute_slm_encode_size(GFX_VER, task_prog_data->base.base.total_shared); task.PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo); @@ -1873,7 +1874,7 @@ emit_mesh_state(struct anv_graphics_pipeline *pipeline) mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier; mesh.SharedLocalMemorySize = - encode_slm_size(GFX_VER, mesh_prog_data->base.base.total_shared); + intel_compute_slm_encode_size(GFX_VER, mesh_prog_data->base.base.total_shared); mesh.PreferredSLMAllocationSize = 
preferred_slm_allocation_size(devinfo); @@ -2080,7 +2081,7 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30), .BarrierEnable = cs_prog_data->uses_barrier, .SharedLocalMemorySize = - encode_slm_size(GFX_VER, cs_prog_data->base.total_shared), + intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared), .ConstantURBEntryReadOffset = 0, .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, diff --git a/src/intel/vulkan/genX_simple_shader.c b/src/intel/vulkan/genX_simple_shader.c index bfe1ba2b5bf..d950b28d040 100644 --- a/src/intel/vulkan/genX_simple_shader.c +++ b/src/intel/vulkan/genX_simple_shader.c @@ -30,6 +30,7 @@ #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" +#include "common/intel_compute_slm.h" #include "common/intel_genX_state_brw.h" static void @@ -580,8 +581,8 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state, .BindingTablePointer = 0, .BindingTableEntryCount = 0, .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, - .SharedLocalMemorySize = encode_slm_size(GFX_VER, - prog_data->base.total_shared), + .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, + prog_data->base.total_shared), .NumberOfBarriers = prog_data->uses_barrier, }; } @@ -649,8 +650,8 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state, .SamplerCount = 0, .BindingTableEntryCount = 0, .BarrierEnable = prog_data->uses_barrier, - .SharedLocalMemorySize = encode_slm_size(GFX_VER, - prog_data->base.total_shared), + .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, + prog_data->base.total_shared), .ConstantURBEntryReadOffset = 0, .ConstantURBEntryReadLength = prog_data->push.per_thread.regs, diff --git a/src/intel/vulkan_hasvk/anv_pipeline.c b/src/intel/vulkan_hasvk/anv_pipeline.c index 570fbd3c442..1d9ca97f363 100644 --- a/src/intel/vulkan_hasvk/anv_pipeline.c +++ b/src/intel/vulkan_hasvk/anv_pipeline.c @@ -29,6 +29,7 
@@ #include "util/mesa-sha1.h" #include "util/os_time.h" +#include "common/intel_compute_slm.h" #include "common/intel_l3_config.h" #include "common/intel_sample_positions.h" #include "compiler/elk/elk_disasm.h" @@ -568,7 +569,7 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline, const unsigned chunk_size = 16; const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size); assert(shared_size <= - elk_calculate_slm_size(compiler->devinfo->ver, nir->info.shared_size)); + intel_compute_slm_calculate_size(compiler->devinfo->ver, nir->info.shared_size)); NIR_PASS(_, nir, nir_zero_initialize_shared_memory, shared_size, chunk_size); diff --git a/src/intel/vulkan_hasvk/genX_pipeline.c b/src/intel/vulkan_hasvk/genX_pipeline.c index 132cdc59498..39f87633a3f 100644 --- a/src/intel/vulkan_hasvk/genX_pipeline.c +++ b/src/intel/vulkan_hasvk/genX_pipeline.c @@ -27,6 +27,7 @@ #include "genxml/genX_pack.h" #include "genxml/genX_rt_pack.h" +#include "common/intel_compute_slm.h" #include "common/intel_genX_state_elk.h" #include "common/intel_l3_config.h" #include "common/intel_sample_positions.h" @@ -1939,8 +1940,7 @@ genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline) */ .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30), .BarrierEnable = cs_prog_data->uses_barrier, - .SharedLocalMemorySize = - elk_encode_slm_size(GFX_VER, cs_prog_data->base.total_shared), + .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, cs_prog_data->base.total_shared), #if GFX_VERx10 != 75 .ConstantURBEntryReadOffset = 0,