Files
mesa/src/intel/vulkan/genX_blorp_exec.c
Lionel Landwerlin c478b6355a anv/blorp/iris: rework Wa_14025112257
Drivers already have to track this workaround, so remove the logic
from Blorp and let the driver manage this.

Also in Anv don't accumulate this workaround, emit it directly in
place right after COMPUTE_WALKER. Accumulating can be problematic when
you want to dispatch concurrent compute shaders that do not need any
cache flush interaction (typical example with the internal
simple_shader framework).

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 3e0ad0176b ("anv: Emit state cache invalidation after every compute dispatch")
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38306>
2025-11-10 08:57:06 +00:00

581 lines
20 KiB
C

/*
* Copyright © 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <assert.h>
#include "anv_private.h"
#include "anv_measure.h"
/* These are defined in anv_private.h and blorp_genX_exec_brw.h */
#undef __gen_address_type
#undef __gen_user_data
#undef __gen_combine_address
#include "common/intel_l3_config.h"
#include "blorp/blorp_genX_exec_brw.h"
#include "ds/intel_tracepoints.h"
static void blorp_measure_start(struct blorp_batch *_batch,
const struct blorp_params *params)
{
struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch;
trace_intel_begin_blorp(&cmd_buffer->trace);
anv_measure_snapshot(cmd_buffer,
blorp_op_to_intel_measure_snapshot(params->op),
NULL, 0);
}
static void blorp_measure_end(struct blorp_batch *_batch,
const struct blorp_params *params)
{
struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch;
trace_intel_end_blorp(&cmd_buffer->trace,
params->op,
params->x1 - params->x0,
params->y1 - params->y0,
params->num_samples,
params->shader_pipeline,
params->dst.view.format,
params->src.view.format,
(_batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
}
static void *
blorp_emit_dwords(struct blorp_batch *batch, unsigned n)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
return anv_batch_emit_dwords(&cmd_buffer->batch, n);
}
static uint64_t
blorp_emit_reloc(struct blorp_batch *batch,
void *location, struct blorp_address address, uint32_t delta)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
struct anv_address anv_addr = {
.bo = address.buffer,
.offset = address.offset,
};
anv_reloc_list_add_bo(cmd_buffer->batch.relocs, anv_addr.bo);
return anv_address_physical(anv_address_add(anv_addr, delta));
}
static void
blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
struct blorp_address address, uint32_t delta)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
VkResult result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
address.buffer);
if (unlikely(result != VK_SUCCESS))
anv_batch_set_error(&cmd_buffer->batch, result);
}
static uint64_t
blorp_get_surface_address(struct blorp_batch *blorp_batch,
struct blorp_address address)
{
struct anv_address anv_addr = {
.bo = address.buffer,
.offset = address.offset,
};
return anv_address_physical(anv_addr);
}
#if GFX_VER == 9
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
return (struct blorp_address) {
.buffer = cmd_buffer->device->internal_surface_state_pool.block_pool.bo,
.offset = -cmd_buffer->device->internal_surface_state_pool.start_offset,
};
}
#endif
static uint32_t
blorp_get_dynamic_state(struct blorp_batch *batch,
enum blorp_dynamic_state name)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
return cmd_buffer->device->blorp.dynamic_states[name].offset;
}
static void *
blorp_alloc_dynamic_state(struct blorp_batch *batch,
uint32_t size,
uint32_t alignment,
uint32_t *offset)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
struct anv_state state =
anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, alignment);
*offset = state.offset;
return state.map;
}
UNUSED static void *
blorp_alloc_general_state(struct blorp_batch *batch,
uint32_t size,
uint32_t alignment,
uint32_t *offset)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
struct anv_state state =
anv_cmd_buffer_alloc_general_state(cmd_buffer, size, alignment);
*offset = state.offset;
return state.map;
}
static bool
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
unsigned state_size, unsigned state_alignment,
uint32_t *bt_offset,
uint32_t *surface_offsets, void **surface_maps)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
uint32_t state_offset;
struct anv_state bt_state;
VkResult result =
anv_cmd_buffer_alloc_blorp_binding_table(cmd_buffer, num_entries,
&state_offset, &bt_state);
if (result != VK_SUCCESS)
return false;
uint32_t *bt_map = bt_state.map;
*bt_offset = bt_state.offset;
for (unsigned i = 0; i < num_entries; i++) {
struct anv_state surface_state =
anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
if (surface_state.map == NULL)
return false;
bt_map[i] = surface_state.offset + state_offset;
surface_offsets[i] = surface_state.offset;
surface_maps[i] = surface_state.map;
}
return true;
}
static uint32_t
blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
uint32_t offset)
{
return offset;
}
static void *
blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
struct blorp_address *addr)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
struct anv_state vb_state =
anv_cmd_buffer_alloc_temporary_state(cmd_buffer, size, 64);
struct anv_address vb_addr =
anv_cmd_buffer_temporary_state_address(cmd_buffer, vb_state);
*addr = (struct blorp_address) {
.buffer = vb_addr.bo,
.offset = vb_addr.offset,
.mocs = isl_mocs(&cmd_buffer->device->isl_dev,
ISL_SURF_USAGE_VERTEX_BUFFER_BIT, false),
};
return vb_state.map;
}
static void
blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
const struct blorp_address *addrs,
uint32_t *sizes,
unsigned num_vbs)
{
#if GFX_VER == 9
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
for (unsigned i = 0; i < num_vbs; i++) {
struct anv_address anv_addr = {
.bo = addrs[i].buffer,
.offset = addrs[i].offset,
};
genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
i, anv_addr, sizes[i]);
}
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
/* Technically, we should call this *after* 3DPRIMITIVE but it doesn't
* really matter for blorp because we never call apply_pipe_flushes after
* this point.
*/
genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL,
(1 << num_vbs) - 1);
#endif
}
UNUSED static struct blorp_address
blorp_get_workaround_address(struct blorp_batch *batch)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
return (struct blorp_address) {
.buffer = cmd_buffer->device->workaround_address.bo,
.offset = cmd_buffer->device->workaround_address.offset,
};
}
static void
blorp_flush_range(struct blorp_batch *batch, void *start, size_t size)
{
/* We don't need to flush states anymore, since everything will be snooped.
*/
}
static void
blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch,
struct intel_urb_config *urb_cfg)
{
#if INTEL_NEEDS_WA_16014912113
struct anv_cmd_buffer *cmd_buffer = blorp_batch->driver_batch;
if (genX(need_wa_16014912113)(
&cmd_buffer->state.gfx.urb_cfg, urb_cfg)) {
genX(batch_emit_wa_16014912113)(&cmd_buffer->batch,
&cmd_buffer->state.gfx.urb_cfg);
}
/* Update urb config. */
memcpy(&cmd_buffer->state.gfx.urb_cfg, urb_cfg, sizeof(*urb_cfg));
#endif
}
static const struct intel_l3_config *
blorp_get_l3_config(struct blorp_batch *batch)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
return cmd_buffer->state.current_l3_config;
}
static void
blorp_exec_on_render(struct blorp_batch *batch,
const struct blorp_params *params)
{
assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0);
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT);
struct anv_gfx_dynamic_state *hw_state =
&cmd_buffer->state.gfx.dyn_state;
const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0,
params->y1 - params->y0, scale);
#if GFX_VER >= 11
/* The PIPE_CONTROL command description says:
*
* "Whenever a Binding Table Index (BTI) used by a Render Target Message
* points to a different RENDER_SURFACE_STATE, SW must issue a Render
* Target Cache Flush by enabling this bit. When render target flush
* is set due to new association of BTI, PS Scoreboard Stall bit must
* be set in this packet."
*/
if (blorp_uses_bti_rt_writes(batch, params)) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
"before blorp BTI change");
}
#endif
#if INTEL_WA_18019816803_GFX_VER
/* Check if blorp ds state matches ours. */
if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) {
bool blorp_ds_state = params->depth.enabled || params->stencil.enabled;
if (hw_state->ds_write_state != blorp_ds_state) {
/* Flag the change in ds_write_state so that the next pipeline use
* will trigger a PIPE_CONTROL too.
*/
hw_state->ds_write_state = blorp_ds_state;
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_WA_18019816803);
/* Add the stall that will flush prior to the blorp operation by
* genX(cmd_buffer_apply_pipe_flushes)
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_PSS_STALL_SYNC_BIT,
"Wa_18019816803");
}
}
#endif
#if INTEL_WA_14018283232_GFX_VER
genX(cmd_buffer_ensure_wa_14018283232)(cmd_buffer, false);
#endif
#if INTEL_WA_18038825448_GFX_VER
if (genX(cmd_buffer_set_coarse_pixel_active)
(cmd_buffer, ANV_COARSE_PIXEL_STATE_DISABLED)) {
batch->flags |= BLORP_BATCH_FORCE_CPS_DEPENDENCY;
}
#endif
if (params->depth.enabled &&
!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, &params->depth.surf);
genX(flush_pipeline_select_3d)(cmd_buffer);
/* Wa_14015814527 */
genX(apply_task_urb_workaround)(cmd_buffer);
/* Apply any outstanding flushes in case pipeline select haven't. */
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
/* BLORP doesn't do anything fancy with depth such as discards, so we want
* the PMA fix off. Also, off is always the safe option.
*/
genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
blorp_exec(batch, params);
#if GFX_VER >= 11
/* The PIPE_CONTROL command description says:
*
* "Whenever a Binding Table Index (BTI) used by a Render Target Message
* points to a different RENDER_SURFACE_STATE, SW must issue a Render
* Target Cache Flush by enabling this bit. When render target flush
* is set due to new association of BTI, PS Scoreboard Stall bit must
* be set in this packet."
*/
if (blorp_uses_bti_rt_writes(batch, params)) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
"after blorp BTI change");
}
#endif
/* Flag all the instructions emitted by BLORP. */
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_URB);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF_STATISTICS);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF_TOPOLOGY);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VERTEX_INPUT);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF_SGVS);
#if GFX_VER >= 11
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VF_SGVS_2);
#endif
#if GFX_VER >= 12
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
#endif
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VIEWPORT_CC);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_STREAMOUT);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_RASTER);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_CLIP);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_SAMPLE_MASK);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_MULTISAMPLE);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_SF);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_SBE);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_SBE_SWIZ);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_DEPTH_BOUNDS);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_WM);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_VS);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_HS);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_DS);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_TE);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_GS);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_PS);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_PS_EXTRA);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_BLEND_STATE);
if (batch->blorp->config.use_mesh_shading) {
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_MESH_CONTROL);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_TASK_CONTROL);
}
if (params->wm_prog_data) {
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_CC_STATE);
BITSET_SET(hw_state->emit_dirty, ANV_GFX_STATE_PS_BLEND);
}
anv_cmd_dirty_mask_t dirty = ~(ANV_CMD_DIRTY_INDEX_BUFFER |
ANV_CMD_DIRTY_XFB_ENABLE |
ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE |
ANV_CMD_DIRTY_INDEX_TYPE);
cmd_buffer->state.gfx.vb_dirty = ~0;
cmd_buffer->state.gfx.dirty |= dirty;
cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
}
static void
blorp_exec_on_compute(struct blorp_batch *batch,
const struct blorp_params *params)
{
assert(batch->flags & BLORP_BATCH_USE_COMPUTE);
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_COMPUTE_BIT);
genX(flush_pipeline_select_gpgpu)(cmd_buffer);
/* Apply any outstanding flushes in case pipeline select haven't. */
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
blorp_exec(batch, params);
cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
cmd_buffer->state.compute.pipeline_dirty = true;
genX(cmd_buffer_post_dispatch_wa)(cmd_buffer);
}
static void
blorp_exec_on_blitter(struct blorp_batch *batch,
const struct blorp_params *params)
{
assert(batch->flags & BLORP_BATCH_USE_BLITTER);
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
assert(anv_cmd_buffer_is_blitter_queue(cmd_buffer));
blorp_exec(batch, params);
}
static enum isl_aux_op
get_color_aux_op(const struct blorp_params *params)
{
switch (params->op) {
case BLORP_OP_CCS_RESOLVE:
case BLORP_OP_CCS_PARTIAL_RESOLVE:
case BLORP_OP_CCS_COLOR_CLEAR:
case BLORP_OP_MCS_COLOR_CLEAR:
assert(params->fast_clear_op != ISL_AUX_OP_NONE);
return params->fast_clear_op;
/* Some auxiliary surface operations are not provided by hardware. To
* provide that functionality, BLORP sometimes tries to emulate what
* hardware would do with custom pixel shaders. For now, we assume that
* BLORP's implementation has the same cache invalidation and flushing
* requirements as similar hardware operations.
*/
case BLORP_OP_CCS_AMBIGUATE:
assert(GFX_VER >= 11 || params->fast_clear_op == ISL_AUX_OP_NONE);
return ISL_AUX_OP_AMBIGUATE;
case BLORP_OP_MCS_AMBIGUATE:
assert(params->fast_clear_op == ISL_AUX_OP_NONE);
return ISL_AUX_OP_AMBIGUATE;
case BLORP_OP_MCS_PARTIAL_RESOLVE:
assert(params->fast_clear_op == ISL_AUX_OP_NONE);
return ISL_AUX_OP_PARTIAL_RESOLVE;
/* If memory aliasing is being done on an image, a pending fast clear
* could hit the destination address at an unknown time. Go back to the
* regular drawing mode to avoid this case.
*/
case BLORP_OP_HIZ_AMBIGUATE:
case BLORP_OP_HIZ_CLEAR:
case BLORP_OP_HIZ_RESOLVE:
case BLORP_OP_SLOW_DEPTH_CLEAR:
assert(params->fast_clear_op == ISL_AUX_OP_NONE);
return ISL_AUX_OP_NONE;
/* The remaining operations are considered regular draws. */
case BLORP_OP_SLOW_COLOR_CLEAR:
case BLORP_OP_BLIT:
case BLORP_OP_COPY:
assert(params->fast_clear_op == ISL_AUX_OP_NONE);
return ISL_AUX_OP_NONE;
}
UNREACHABLE("Invalid value in params->op");
}
void
genX(blorp_exec)(struct blorp_batch *batch,
const struct blorp_params *params)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
/* Turn on preemption if it was toggled off. */
if (!cmd_buffer->state.gfx.object_preemption)
genX(cmd_buffer_set_preemption)(cmd_buffer, true);
if (!cmd_buffer->state.current_l3_config) {
const struct intel_l3_config *cfg =
intel_get_default_l3_config(cmd_buffer->device->info);
genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
}
/* Flush any in-progress CCS/MCS operations as needed. */
const enum isl_aux_op aux_op = get_color_aux_op(params);
genX(cmd_buffer_update_color_aux_op(cmd_buffer, aux_op));
if (batch->flags & BLORP_BATCH_USE_BLITTER)
blorp_exec_on_blitter(batch, params);
else if (batch->flags & BLORP_BATCH_USE_COMPUTE)
blorp_exec_on_compute(batch, params);
else
blorp_exec_on_render(batch, params);
}
static void
blorp_emit_pre_draw(struct blorp_batch *batch, const struct blorp_params *params)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
blorp_measure_start(batch, params);
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
}
static void
blorp_emit_post_draw(struct blorp_batch *batch, const struct blorp_params *params)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
cmd_buffer->device,
_3DPRIM_RECTLIST,
3);
genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
blorp_measure_end(batch, params);
}
void
genX(blorp_init_dynamic_states)(struct blorp_context *context)
{
blorp_init_dynamic_states(context);
}