From 423d472a4ec80ba12ff0e199d511e8193cab815a Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 1 Nov 2024 14:44:40 +0000 Subject: [PATCH] tu: Re-emit visibility stream before each render pass When we set the visibility stream with CP_SET_PSEUDO_REG, it does two things (or only one of the two, with concurrent binning): - Set the "pseudo register" used by CP_SET_BIN_DATA5_OFFSET, which in turn is used when decoding the vis. streams. - Set the VSC register used by the binning pass. Preemption with skipsaverestore obliterates the second, but not the first. This means that before running the binning pass, we have to re-emit these registers. I *think* this is what the blob does on a7xx. On a6xx, where the pseudo register doesn't exist, the blob seems to re-emit the preamble every time we re-allocate the visibility streams, but we don't support a6xx yet so we can defer making that decision. Fixes supertuxkart under zink with preemption enabled in the kernel. Fixes: 1d2b479a3b3 ("tu: Allow being preempted on a7xx") Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.cc | 42 ++++++++++++++++++++------- src/freedreno/vulkan/tu_cmd_buffer.h | 1 + 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index 13929a39fb9..18777d52299 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -98,9 +98,8 @@ tu6_lazy_emit_tessfactor_addr(struct tu_cmd_buffer *cmd) cmd->state.tessfactor_addr_set = true; } -template static void -tu6_lazy_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_lazy_init_vsc(struct tu_cmd_buffer *cmd) { struct tu_device *dev = cmd->device; uint32_t num_vsc_pipes = dev->physical_device->info->num_vsc_pipes; @@ -136,23 +135,30 @@ tu6_lazy_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_get_scratch_bo(dev, size0 + num_vsc_pipes * 4, &vsc_bo); + cmd->vsc_draw_strm_va = vsc_bo->iova + cmd->vsc_prim_strm_pitch * num_vsc_pipes; + cmd->vsc_draw_strm_size_va = vsc_bo->iova + size0; + cmd->vsc_prim_strm_va = vsc_bo->iova; +} + +template +static void +tu_emit_vsc(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ if (CHIP == A6XX) { tu_cs_emit_regs(cs, - A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0)); + A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.qword = cmd->vsc_draw_strm_size_va)); tu_cs_emit_regs(cs, - A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo)); + A6XX_VSC_PRIM_STRM_ADDRESS(.qword = cmd->vsc_prim_strm_va)); tu_cs_emit_regs( - cs, A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo, - .bo_offset = cmd->vsc_prim_strm_pitch * - num_vsc_pipes)); + cs, A6XX_VSC_DRAW_STRM_ADDRESS(.qword = cmd->vsc_draw_strm_va)); } else { tu_cs_emit_pkt7(cs, CP_SET_PSEUDO_REG, 3 * 3); tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(DRAW_STRM_ADDRESS)); - tu_cs_emit_qw(cs, vsc_bo->iova + cmd->vsc_prim_strm_pitch * num_vsc_pipes); + tu_cs_emit_qw(cs, cmd->vsc_draw_strm_va); tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(DRAW_STRM_SIZE_ADDRESS)); - tu_cs_emit_qw(cs, vsc_bo->iova + size0); + tu_cs_emit_qw(cs, cmd->vsc_draw_strm_size_va); tu_cs_emit(cs, A6XX_CP_SET_PSEUDO_REG__0_PSEUDO_REG(PRIM_STRM_ADDRESS)); - tu_cs_emit_qw(cs, vsc_bo->iova); + tu_cs_emit_qw(cs, cmd->vsc_prim_strm_va); } cmd->vsc_initialized = true; @@ -2121,9 +2127,23 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, if (use_hw_binning(cmd)) { if (!cmd->vsc_initialized) { - tu6_lazy_emit_vsc(cmd, cs); + tu6_lazy_init_vsc(cmd); } + /* We always emit VSC before each renderpass, because due to + * skipsaverestore the underlying VSC registers may have become + * invalid. Normally we'd need to WFI before setting these non-context + * registers, but we should be safe because we're only setting it to the + * same value it had before. + * + * TODO: On a6xx, we have to emit this per-bin or make the amble include + * these registers, because CP_SET_BIN_DATA5_OFFSET will use the + * register instead of the pseudo register and its value won't survive + * across preemptions. The blob seems to take the second approach and + * emits the preamble lazily. + */ + tu_emit_vsc(cmd, cs); + tu6_emit_bin_size(cs, tiling->tile0.width, tiling->tile0.height, { .render_mode = BINNING_PASS, diff --git a/src/freedreno/vulkan/tu_cmd_buffer.h b/src/freedreno/vulkan/tu_cmd_buffer.h index a9ff21e6879..669b8a0efc8 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.h +++ b/src/freedreno/vulkan/tu_cmd_buffer.h @@ -611,6 +611,7 @@ struct tu_cmd_buffer uint32_t vsc_draw_strm_pitch; uint32_t vsc_prim_strm_pitch; + uint64_t vsc_draw_strm_va, vsc_draw_strm_size_va, vsc_prim_strm_va; bool vsc_initialized; }; VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,