radeonsi: do not do two full flushes on every compute dispatch
v2: Add more CS_PARTIAL_FLUSH events. Essentially every place that waits for pixel shaders to finish also has a write-after-read hazard with compute shaders. Invalidating L2 implicitly waits for pixel and compute shaders, so we don't need a CS_PARTIAL_FLUSH when switching FBOs. v3: Add CS_PARTIAL_FLUSH events even where we already have INV_GLOBAL_L2. According to Marek, the INV_GLOBAL_L2 events don't wait for compute shaders to finish, so wait for them explicitly. Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Reviewed-by: Marek Olšák <marek.olsak@amd.com> Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com> Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
This commit is contained in:
@@ -441,13 +441,8 @@ static void si_launch_grid(
|
||||
if (!sctx->cs_shader_state.initialized)
|
||||
si_initialize_compute(sctx);
|
||||
|
||||
sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
|
||||
SI_CONTEXT_INV_GLOBAL_L2 |
|
||||
SI_CONTEXT_INV_ICACHE |
|
||||
SI_CONTEXT_INV_SMEM_L1 |
|
||||
SI_CONTEXT_FLUSH_WITH_INV_L2 |
|
||||
SI_CONTEXT_FLAG_COMPUTE;
|
||||
si_emit_cache_flush(sctx, NULL);
|
||||
if (sctx->b.flags)
|
||||
si_emit_cache_flush(sctx, NULL);
|
||||
|
||||
if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
|
||||
return;
|
||||
@@ -480,14 +475,6 @@ static void si_launch_grid(
|
||||
si_setup_tgsi_grid(sctx, info);
|
||||
|
||||
si_emit_dispatch_packets(sctx, info);
|
||||
|
||||
sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
|
||||
SI_CONTEXT_INV_VMEM_L1 |
|
||||
SI_CONTEXT_INV_GLOBAL_L2 |
|
||||
SI_CONTEXT_INV_ICACHE |
|
||||
SI_CONTEXT_INV_SMEM_L1 |
|
||||
SI_CONTEXT_FLAG_COMPUTE;
|
||||
si_emit_cache_flush(sctx, NULL);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
|
||||
uint64_t va = r600_resource(dst)->gpu_address + offset;
|
||||
|
||||
/* Flush the caches. */
|
||||
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
|
||||
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
|
||||
SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
|
||||
|
||||
while (size) {
|
||||
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
|
||||
@@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
|
||||
}
|
||||
|
||||
/* Flush the caches. */
|
||||
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
|
||||
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
|
||||
SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
|
||||
|
||||
/* This is the main part doing the copying. Src is always aligned. */
|
||||
main_dst_offset = dst_offset + skipped_size;
|
||||
|
||||
@@ -1032,7 +1032,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
|
||||
* start writing to the targets.
|
||||
*/
|
||||
if (num_targets)
|
||||
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
|
||||
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
|
||||
SI_CONTEXT_CS_PARTIAL_FLUSH;
|
||||
|
||||
/* Streamout buffers must be bound in 2 places:
|
||||
* 1) in VGT by setting the VGT_STRMOUT registers
|
||||
|
||||
@@ -117,6 +117,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
|
||||
ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
|
||||
SI_CONTEXT_INV_VMEM_L1 |
|
||||
SI_CONTEXT_INV_GLOBAL_L2 |
|
||||
SI_CONTEXT_CS_PARTIAL_FLUSH |
|
||||
/* this is probably not needed anymore */
|
||||
SI_CONTEXT_PS_PARTIAL_FLUSH;
|
||||
si_emit_cache_flush(ctx, NULL);
|
||||
|
||||
@@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
|
||||
*/
|
||||
sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
|
||||
SI_CONTEXT_INV_GLOBAL_L2 |
|
||||
SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
|
||||
SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
|
||||
SI_CONTEXT_CS_PARTIAL_FLUSH;
|
||||
|
||||
/* Take the maximum of the old and new count. If the new count is lower,
|
||||
* dirtying is needed to disable the unbound colorbuffers.
|
||||
@@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
|
||||
|
||||
sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
|
||||
SI_CONTEXT_INV_GLOBAL_L2 |
|
||||
SI_CONTEXT_FLUSH_AND_INV_CB;
|
||||
SI_CONTEXT_FLUSH_AND_INV_CB |
|
||||
SI_CONTEXT_CS_PARTIAL_FLUSH;
|
||||
}
|
||||
|
||||
static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
|
||||
@@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
|
||||
|
||||
/* Subsequent commands must wait for all shader invocations to
|
||||
* complete. */
|
||||
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
|
||||
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
|
||||
SI_CONTEXT_CS_PARTIAL_FLUSH;
|
||||
|
||||
if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
|
||||
sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
|
||||
@@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
|
||||
PIPE_BARRIER_SHADER_BUFFER |
|
||||
PIPE_BARRIER_TEXTURE |
|
||||
PIPE_BARRIER_IMAGE |
|
||||
PIPE_BARRIER_STREAMOUT_BUFFER)) {
|
||||
PIPE_BARRIER_STREAMOUT_BUFFER |
|
||||
PIPE_BARRIER_GLOBAL_BUFFER)) {
|
||||
/* As far as I can tell, L1 contents are written back to L2
|
||||
* automatically at end of shader, but the contents of other
|
||||
* L1 caches might still be stale. */
|
||||
|
||||
Reference in New Issue
Block a user