diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c new file mode 100644 index 00000000000..ec549d48584 --- /dev/null +++ b/src/broadcom/common/v3d_util.c @@ -0,0 +1,81 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "v3d_util.h" +#include "util/macros.h" + +/* Choose a number of workgroups per supergroup that maximizes + * lane occupancy. We can pack up to 16 workgroups into a supergroup. + */ +uint32_t +v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + bool has_tsy_barrier, + uint32_t threads, + uint32_t num_wgs, + uint32_t wg_size) +{ + /* Compute maximum number of batches in a supergroup for this workgroup size. 
+ * Each batch is 16 elements, and we can have up to 16 work groups in a + * supergroup: + * + * max_batches_per_sg = (wg_size * max_wgs_per_sg) / elements_per_batch + * since max_wgs_per_sg = 16 and elements_per_batch = 16, we get: + * max_batches_per_sg = wg_size + */ + uint32_t max_batches_per_sg = wg_size; + + /* QPU threads will stall at TSY barriers until the entire supergroup + * reaches the barrier. Limit the supergroup size to half the QPU threads + * available, so we can have at least 2 supergroups executing in parallel + * and we don't stall all our QPU threads when a supergroup hits a barrier. + */ + if (has_tsy_barrier) { + uint32_t max_qpu_threads = devinfo->qpu_count * threads; + max_batches_per_sg = MIN2(max_batches_per_sg, max_qpu_threads / 2); + } + uint32_t max_wgs_per_sg = max_batches_per_sg * 16 / wg_size; + + uint32_t best_wgs_per_sg = 1; + uint32_t best_unused_lanes = 16; + for (uint32_t wgs_per_sg = 1; wgs_per_sg <= max_wgs_per_sg; wgs_per_sg++) { + /* Don't try to pack more workgroups per supergroup than the total amount + * of workgroups dispatched. + */ + if (wgs_per_sg > num_wgs) + return best_wgs_per_sg; + + /* Compute wasted lanes for this configuration and keep track of the + * config with less waste. 
+ */ + uint32_t unused_lanes = (16 - ((wgs_per_sg * wg_size) % 16)) & 0x0f; + if (unused_lanes == 0) + return wgs_per_sg; + + if (unused_lanes < best_unused_lanes) { + best_wgs_per_sg = wgs_per_sg; + best_unused_lanes = unused_lanes; + } + } + + return best_wgs_per_sg; +} diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h new file mode 100644 index 00000000000..fc304f55958 --- /dev/null +++ b/src/broadcom/common/v3d_util.h @@ -0,0 +1,36 @@ +/* + * Copyright © 2021 Raspberry Pi + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef V3D_UTIL_H +#define V3D_UTIL_H + +#include "common/v3d_device_info.h" + +uint32_t +v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, + bool has_tsy_barrier, + uint32_t threads, + uint32_t num_wgs, + uint32_t wg_size); + +#endif diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build index f558aaca4e1..6b51a37e0b1 100644 --- a/src/broadcom/meson.build +++ b/src/broadcom/meson.build @@ -53,7 +53,7 @@ endforeach libbroadcom_v3d = static_library( 'libbroadcom_v3d', [ - files('common/v3d_debug.c', 'common/v3d_device_info.c', 'clif/clif_dump.c'), + files('common/v3d_debug.c', 'common/v3d_device_info.c', 'clif/clif_dump.c', 'common/v3d_util.c'), v3d_xml_pack, ], include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_broadcom], diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c index eee71610ce5..aadbdb85626 100644 --- a/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -5268,62 +5268,6 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job( } } -/* Choose a number of workgroups per supergroup that maximizes - * lane occupancy. We can pack up to 16 workgroups into a supergroup. - */ -static uint32_t -choose_workgroups_per_supergroup(struct v3d_device_info *devinfo, - struct v3dv_shader_variant *cs, - uint32_t num_wgs, - uint32_t wg_size) -{ - /* Compute maximum number of batches in a supergroup for this workgroup size. - * Each batch is 16 elements, and we can have up to 16 work groups in a - * supergroup: - * - * max_batches_per_sg = (wg_size * max_wgs_per_sg) / elements_per_batch - * since max_wgs_per_sg = 16 and elements_per_batch = 16, we get: - * max_batches_per_sg = wg_size - */ - uint32_t max_batches_per_sg = wg_size; - - /* QPU threads will stall at TSY barriers until the entire supergroup - * reaches the barrier. 
Limit the supergroup size to half the QPU threads - * available, so we can have at least 2 supergroups executing in parallel - * and we don't stall all our QPU threads when a supergroup hits a barrier. - */ - if (cs->prog_data.cs->base.has_control_barrier) { - uint32_t max_qpu_threads = - devinfo->qpu_count * cs->prog_data.cs->base.threads; - max_batches_per_sg = MIN2(max_batches_per_sg, max_qpu_threads / 2); - } - uint32_t max_wgs_per_sg = max_batches_per_sg * 16 / wg_size; - - uint32_t best_wgs_per_sg = 1; - uint32_t best_unused_lanes = 16; - for (uint32_t wgs_per_sg = 1; wgs_per_sg <= max_wgs_per_sg; wgs_per_sg++) { - /* Don't try to pack more workgroups per supergroup than the total amount - * of workgroups dispatched. - */ - if (wgs_per_sg > num_wgs) - return best_wgs_per_sg; - - /* Compute wasted lines for this configuration and keep track of the - * config with less waste. - */ - uint32_t unused_lanes = (16 - ((wgs_per_sg * wg_size) % 16)) & 0x0f; - if (unused_lanes == 0) - return wgs_per_sg; - - if (unused_lanes < best_unused_lanes) { - best_wgs_per_sg = wgs_per_sg; - best_unused_lanes = unused_lanes; - } - } - - return best_wgs_per_sg; -} - static struct v3dv_job * cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, uint32_t group_count_x, @@ -5367,8 +5311,12 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, cpd->local_size[2]; uint32_t wgs_per_sg = - choose_workgroups_per_supergroup(&cmd_buffer->device->devinfo, - cs_variant, num_wgs, wg_size); + v3d_csd_choose_workgroups_per_supergroup( + &cmd_buffer->device->devinfo, + cs_variant->prog_data.cs->base.has_control_barrier, + cs_variant->prog_data.cs->base.threads, + num_wgs, wg_size); + uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16); uint32_t whole_sgs = num_wgs / wgs_per_sg; uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg; diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index 703f0e53dae..2f466af7853 100644 --- 
a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -56,6 +56,7 @@ #include "common/v3d_device_info.h" #include "common/v3d_limits.h" +#include "common/v3d_util.h" #include "compiler/shader_enums.h" #include "compiler/spirv/nir_spirv.h"