broadcom/common: Optimize CSD super-group packing
Return one work group per super group when the work group size is a multiple of 16 (elements per batch), and recalculate max_wgs_per_sg only when TSY barriers cut the available QPU threads.

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37836>
This commit is contained in:
committed by
Iago Toral Quiroga
parent
099ac5be1a
commit
1326d52d23
@@ -41,6 +41,12 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
|
||||
if (has_subgroups)
|
||||
return 1;
|
||||
|
||||
/* If the workgroup size is a multiple of 16 (elements per batch),
|
||||
* the lane occupancy is already maximized.
|
||||
*/
|
||||
if (wg_size % 16 == 0)
|
||||
return 1;
|
||||
|
||||
/* Compute maximum number of batches in a supergroup for this workgroup size.
|
||||
* Each batch is 16 elements, and we can have up to 16 work groups in a
|
||||
* supergroup:
|
||||
@@ -56,11 +62,13 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
|
||||
* available, so we can have at least 2 supergroups executing in parallel
|
||||
* and we don't stall all our QPU threads when a supergroup hits a barrier.
|
||||
*/
|
||||
uint32_t max_wgs_per_sg = 16;
|
||||
|
||||
if (has_tsy_barrier) {
|
||||
uint32_t max_qpu_threads = devinfo->qpu_count * threads;
|
||||
max_batches_per_sg = MIN2(max_batches_per_sg, max_qpu_threads / 2);
|
||||
max_wgs_per_sg = max_batches_per_sg * 16 / wg_size;
|
||||
}
|
||||
uint32_t max_wgs_per_sg = max_batches_per_sg * 16 / wg_size;
|
||||
|
||||
uint32_t best_wgs_per_sg = 1;
|
||||
uint32_t best_unused_lanes = 16;
|
||||
|
||||
Reference in New Issue
Block a user