diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 88d6a3d4b50..85bf1f509bb 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -1382,8 +1382,17 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs); + /* Primitives that passed all tests are still counted in in each + * tile even with HW binning beforehand. Do not permit it. + */ + if (cmd->state.prim_generated_query_running_before_rp) + tu6_emit_event_write(cmd, cs, STOP_PRIMITIVE_CTRS); + tu_cs_emit_call(cs, &cmd->draw_cs); + if (cmd->state.prim_generated_query_running_before_rp) + tu6_emit_event_write(cmd, cs, START_PRIMITIVE_CTRS); + if (use_hw_binning(cmd)) { tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1); tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS)); @@ -1747,6 +1756,9 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, } else if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { assert(pBeginInfo->pInheritanceInfo); + cmd_buffer->inherited_pipeline_statistics = + pBeginInfo->pInheritanceInfo->pipelineStatistics; + vk_foreach_struct(ext, pBeginInfo->pInheritanceInfo) { switch (ext->sType) { case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: { diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index 9c09bbd647d..45688837f66 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -1868,6 +1868,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice, struct tu6_global *global = device->global_bo->map; tu_init_clear_blit_shaders(device); global->predicate = 0; + global->vtx_stats_query_not_running = 1; global->dbg_one = (uint32_t)-1; global->dbg_gmem_total_loads = 0; global->dbg_gmem_taken_loads = 0; diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 38735010d83..c3dbd7d795b 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -484,6 +484,8 @@ struct tu6_global ALIGN16 uint32_t cs_indirect_xyz[3]; + volatile uint32_t vtx_stats_query_not_running; + /* To know when renderpass stats for autotune are valid */ volatile uint32_t autotune_fence; @@ -1258,6 +1260,12 @@ struct tu_cmd_state */ uint32_t drawcall_bandwidth_per_sample_sum; + /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and + * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaniously, + * but they use the same {START,STOP}_PRIMITIVE_CTRS control. + */ + uint32_t prim_counters_running; + bool prim_generated_query_running_before_rp; bool has_prim_generated_query_in_rp; @@ -1304,6 +1312,8 @@ struct tu_cmd_buffer VkCommandBufferUsageFlags usage_flags; enum tu_cmd_buffer_status status; + VkQueryPipelineStatisticFlags inherited_pipeline_statistics; + struct tu_cmd_state state; uint32_t queue_family_index; diff --git a/src/freedreno/vulkan/tu_query.c b/src/freedreno/vulkan/tu_query.c index 20d5ee40294..ab003a276e3 100644 --- a/src/freedreno/vulkan/tu_query.c +++ b/src/freedreno/vulkan/tu_query.c @@ -426,9 +426,9 @@ statistics_index(uint32_t *statistics) } static bool -is_pipeline_query_with_vertex_stage(struct tu_query_pool *pool) +is_pipeline_query_with_vertex_stage(uint32_t pipeline_statistics) { - return pool->pipeline_statistics & + return pipeline_statistics & (VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT | VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT | VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT | @@ -441,16 +441,16 @@ is_pipeline_query_with_vertex_stage(struct tu_query_pool *pool) } static bool -is_pipeline_query_with_fragment_stage(struct tu_query_pool *pool) +is_pipeline_query_with_fragment_stage(uint32_t pipeline_statistics) { - return pool->pipeline_statistics & + return pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT; } static bool -is_pipeline_query_with_compute_stage(struct tu_query_pool *pool) +is_pipeline_query_with_compute_stage(uint32_t pipeline_statistics) { - return pool->pipeline_statistics & + return pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT; } @@ -871,15 +871,35 @@ emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf, struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs; uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin); - if (is_pipeline_query_with_vertex_stage(pool)) { + if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) { + bool need_cond_exec = cmdbuf->state.pass && cmdbuf->state.prim_counters_running; + cmdbuf->state.prim_counters_running++; + + /* Prevent starting primitive counters when it is supposed to be stopped + * for outer VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT query. + */ + if (need_cond_exec) { + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | + CP_COND_REG_EXEC_0_SYSMEM | + CP_COND_REG_EXEC_0_BINNING); + } + tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS); + + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); + tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); + tu_cs_emit(cs, 0); + + if (need_cond_exec) { + tu_cond_exec_end(cs); + } } - if (is_pipeline_query_with_fragment_stage(pool)) { + if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) { tu6_emit_event_write(cmdbuf, cs, START_FRAGMENT_CTRS); } - if (is_pipeline_query_with_compute_stage(pool)) { + if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) { tu6_emit_event_write(cmdbuf, cs, START_COMPUTE_CTRS); } @@ -1008,6 +1028,17 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf, cmdbuf->state.prim_generated_query_running_before_rp = true; } + cmdbuf->state.prim_counters_running++; + + if (cmdbuf->state.pass) { + /* Primitives that passed all tests are still counted in in each + * tile even with HW binning beforehand. Do not permit it. + */ + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | + CP_COND_REG_EXEC_0_SYSMEM | + CP_COND_REG_EXEC_0_BINNING); + } + tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS); tu_cs_emit_wfi(cs); @@ -1017,6 +1048,10 @@ emit_begin_prim_generated_query(struct tu_cmd_buffer *cmdbuf, CP_REG_TO_MEM_0_CNT(2) | CP_REG_TO_MEM_0_64B); tu_cs_emit_qw(cs, begin_iova); + + if (cmdbuf->state.pass) { + tu_cond_exec_end(cs); + } } VKAPI_ATTR void VKAPI_CALL @@ -1152,6 +1187,53 @@ emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf, tu_cs_emit_qw(cs, 0x1); } +/* PRIMITIVE_CTRS is used for two distinct queries: + * - VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT + * - VK_QUERY_TYPE_PIPELINE_STATISTICS + * If one is nested inside other - STOP_PRIMITIVE_CTRS should be emitted + * only for outer query. + * + * Also, pipeline stat query could run outside of renderpass and prim gen + * query inside of secondary cmd buffer - for such case we ought to track + * the status of pipeline stats query. + */ +static void +emit_stop_primitive_ctrs(struct tu_cmd_buffer *cmdbuf, + struct tu_cs *cs, + enum VkQueryType query_type) +{ + bool is_secondary = cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY; + cmdbuf->state.prim_counters_running--; + if (cmdbuf->state.prim_counters_running == 0) { + bool need_cond_exec = + is_secondary && + query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT && + is_pipeline_query_with_vertex_stage(cmdbuf->inherited_pipeline_statistics); + + if (!need_cond_exec) { + tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS); + } else { + tu_cs_reserve(cs, 7 + 2); + /* Check that pipeline stats query is not running, only then + * we count stop the counter. + */ + tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6); + tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); + tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); + tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2)); + tu_cs_emit(cs, 2); /* Cond execute the next 2 DWORDS */ + + tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS); + } + } + + if (query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { + tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3); + tu_cs_emit_qw(cs, global_iova(cmdbuf, vtx_stats_query_not_running)); + tu_cs_emit(cs, 1); + } +} + static void emit_end_stat_query(struct tu_cmd_buffer *cmdbuf, struct tu_query_pool *pool, @@ -1164,15 +1246,19 @@ emit_end_stat_query(struct tu_cmd_buffer *cmdbuf, uint64_t stat_start_iova; uint64_t stat_stop_iova; - if (is_pipeline_query_with_vertex_stage(pool)) { - tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS); + if (is_pipeline_query_with_vertex_stage(pool->pipeline_statistics)) { + /* No need to conditionally execute STOP_PRIMITIVE_CTRS when + * we are inside VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT inside of a + * renderpass, because it is already stopped. + */ + emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PIPELINE_STATISTICS); } - if (is_pipeline_query_with_fragment_stage(pool)) { + if (is_pipeline_query_with_fragment_stage(pool->pipeline_statistics)) { tu6_emit_event_write(cmdbuf, cs, STOP_FRAGMENT_CTRS); } - if (is_pipeline_query_with_compute_stage(pool)) { + if (is_pipeline_query_with_compute_stage(pool->pipeline_statistics)) { tu6_emit_event_write(cmdbuf, cs, STOP_COMPUTE_CTRS); } @@ -1355,7 +1441,11 @@ emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf, uint64_t result_iova = primitives_generated_query_iova(pool, query, result); uint64_t available_iova = query_available_iova(pool, query); - tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS); + if (cmdbuf->state.pass) { + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | + CP_COND_REG_EXEC_0_SYSMEM | + CP_COND_REG_EXEC_0_BINNING); + } tu_cs_emit_wfi(cs); @@ -1375,6 +1465,15 @@ emit_end_prim_generated_query(struct tu_cmd_buffer *cmdbuf, tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0); + /* Should be after waiting for mem writes to have up to date info + * about which query is running. + */ + emit_stop_primitive_ctrs(cmdbuf, cs, VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT); + + if (cmdbuf->state.pass) { + tu_cond_exec_end(cs); + } + if (cmdbuf->state.pass) cs = &cmdbuf->draw_epilogue_cs; diff --git a/src/gallium/drivers/zink/ci/zink-tu-a630-flakes.txt b/src/gallium/drivers/zink/ci/zink-tu-a630-flakes.txt index 8d342317bd0..268c4c5dc19 100644 --- a/src/gallium/drivers/zink/ci/zink-tu-a630-flakes.txt +++ b/src/gallium/drivers/zink/ci/zink-tu-a630-flakes.txt @@ -1,6 +1,3 @@ -# #6603 -GTF-GL46.gtf40.GL3Tests.transform_feedback2.transform_feedback2_states - GTF-GL46.gtf32.GL3Tests.packed_pixels.packed_pixels_pixelstore KHR-Single-GL46.arrays_of_arrays_gl.ConstructorsAndUnsizedDeclConstructorSizing1 dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_write_dynamic_read_vertex