From 3f3fa5ee0c30a35e0f9b7bbcae287151f05e57a2 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 10 Nov 2023 10:49:45 -0800 Subject: [PATCH] freedreno/a6xx: Rework wave input size Rework to match tu. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7464 Signed-off-by: Rob Clark Part-of: --- src/freedreno/ci/freedreno-a660-fails.txt | 1 - .../drivers/freedreno/a6xx/fd6_program.cc | 69 +++++++++---------- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/src/freedreno/ci/freedreno-a660-fails.txt b/src/freedreno/ci/freedreno-a660-fails.txt index ef106528a08..7dca9f1aada 100644 --- a/src/freedreno/ci/freedreno-a660-fails.txt +++ b/src/freedreno/ci/freedreno-a660-fails.txt @@ -3,7 +3,6 @@ KHR-GL46.gpu_shader_fp64.fp64.max_uniform_components,Fail KHR-GL46.multi_bind.dispatch_bind_image_textures,Fail KHR-GL46.shader_image_load_store.basic-allTargets-store,Fail KHR-GL46.shader_subroutine.control_flow_and_returned_subroutine_values_used_as_subroutine_input,Fail -KHR-GL46.tessellation_shader.single.max_patch_vertices,Fail # Fails when TU_DEBUG=forcebin is set gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc index 53aee8b7a7e..4ee1852b757 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc @@ -1101,46 +1101,43 @@ setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b) } if (b->hs) { + uint32_t patch_control_points = b->key->patch_vertices; + + uint32_t patch_local_mem_size_16b = + patch_control_points * b->vs->output_size / 4; + + /* Total attribute slots in HS incoming patch. */ + OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); + OUT_RING(ring, patch_local_mem_size_16b); + + const uint32_t wavesize = 64; + const uint32_t vs_hs_local_mem_size = 16384; + + uint32_t max_patches_per_wave; if (b->ctx->screen->info->a6xx.tess_use_shared) { - unsigned hs_input_size = 6 + (3 * (b->vs->output_size - 1)); - unsigned wave_input_size = - MIN2(64, DIV_ROUND_UP(hs_input_size * 4, - b->hs->tess.tcs_vertices_out)); - - OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); - OUT_RING(ring, hs_input_size); - - OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - OUT_RING(ring, wave_input_size); + /* HS invocations for a patch are always within the same wave, + * making barriers less expensive. VS can't have barriers so we + * don't care about VS invocations being in the same wave. + */ + max_patches_per_wave = wavesize / b->hs->tess.tcs_vertices_out; } else { - uint32_t hs_input_size = - b->hs->tess.tcs_vertices_out * b->vs->output_size / 4; - - /* Total attribute slots in HS incoming patch. */ - OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); - OUT_RING(ring, hs_input_size); - - const uint32_t wavesize = 64; - const uint32_t max_wave_input_size = 64; - const uint32_t patch_control_points = b->hs->tess.tcs_vertices_out; - - /* note: if HS is really just the VS extended, then this - * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) - * however that doesn't match the blob, and fails some dEQP tests. - */ - uint32_t prims_per_wave = wavesize / b->hs->tess.tcs_vertices_out; - uint32_t max_prims_per_wave = max_wave_input_size * wavesize / - (b->vs->output_size * patch_control_points); - prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); - - uint32_t total_size = - b->vs->output_size * patch_control_points * prims_per_wave; - uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); - - OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - OUT_RING(ring, wave_input_size); + /* VS is also in the same wave */ + max_patches_per_wave = + wavesize / MAX2(patch_control_points, + b->hs->tess.tcs_vertices_out); } + + uint32_t patches_per_wave = + MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16), + max_patches_per_wave); + + uint32_t wave_input_size = DIV_ROUND_UP( + patches_per_wave * patch_local_mem_size_16b * 16, 256); + + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); + OUT_RING(ring, wave_input_size); + enum a6xx_tess_output output; if (b->ds->tess.point_mode) output = TESS_POINTS;