tu: Disable FLAG_WAIT_FOR_BR sync when CB is disabled

Skip the TU_CMD_FLAG_WAIT_FOR_BR wait whenever concurrent binning (CB) is
disabled. Without CB there is nothing to wait for, so the sync only adds
overhead, and in workloads with thousands of tiny renderpasses the cumulative
overhead becomes significant.

In one real-world workload I saw the following timings:
- 99.20 ms without disabling TU_CMD_FLAG_WAIT_FOR_BR
- 65.15 ms with TU_CMD_FLAG_WAIT_FOR_BR disabled
- 64.92 ms with TU_DEBUG=nocb

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38378>
This commit is contained in:
Danylo Piliaiev
2025-11-11 14:54:46 +01:00
committed by Marge Bot
parent 9370bdc61e
commit 8827123fef
+19 -2
View File
@@ -411,9 +411,15 @@ tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer)
tu6_emit_flushes<CHIP>(cmd_buffer, cs, cache);
if ((flushes & TU_CMD_FLAG_WAIT_FOR_BR) && CHIP >= A7XX &&
!(cmd_buffer->state.pass && cmd_buffer->state.renderpass_cb_disabled)) {
!(cmd_buffer->state.pass && cmd_buffer->state.renderpass_cb_disabled) &&
!TU_DEBUG(NO_CONCURRENT_BINNING)) {
trace_start_concurrent_binning_barrier(&cmd_buffer->trace, cs, cmd_buffer);
/* Wait-for-BR when repeated a lot of times per frame can add up
* and tank performance.
*/
struct tu_cs_patchable_state cb_state = tu_cs_patchable_start(cs, 64);
tu_cs_emit_pkt7(cs, CP_THREAD_CONTROL, 1);
tu_cs_emit(cs, CP_THREAD_CONTROL_0_THREAD(CP_SET_THREAD_BOTH));
@@ -453,8 +459,19 @@ tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer)
*/
tu7_wait_onchip_val(cs, TU_ONCHIP_BARRIER, 0);
tu7_thread_control(cs, CP_SET_THREAD_BR);
tu_cs_patchable_end(cs, false, &cb_state);
tu_add_cb_barrier_info(cmd_buffer);
tu7_set_thread_br_patchpoint(cmd_buffer, cs, false);
struct tu_cb_control_point cb_patch = {
.type = TU_CB_CONTROL_TYPE_PATCHPOINT,
.patchpoint = cb_state.nop_header,
.patch_value = cb_state.enable_patch,
.original_value = cb_state.disable_patch,
};
util_dynarray_append(&cmd_buffer->cb_control_points, cb_patch);
trace_end_concurrent_binning_barrier(&cmd_buffer->trace, cs);
}