radeonsi: move barriers out of si_compute_clear_copy_buffer & si_cp_dma_*

Some places don't need si_barrier_before_simple_buffer_op.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31193>
This commit is contained in:
Marek Olšák
2024-08-22 15:45:40 -04:00
committed by Marge Bot
parent 5d607348a3
commit 80a6f568e2
5 changed files with 43 additions and 20 deletions
+13 -4
View File
@@ -382,10 +382,8 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
struct pipe_grid_info grid = {};
set_work_size(&grid, dispatch.workgroup_size, 1, 1, dispatch.num_threads, 1, 1);
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
si_launch_grid_internal_ssbos(sctx, &grid, shader, flags, dispatch.num_ssbos, sb,
is_copy ? 0x2 : 0x1);
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
return true;
}
@@ -408,11 +406,15 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
if (util_lower_clearsize_to_dword(clear_value, (int*)&clear_value_size, &clamped))
clear_value = &clamped;
si_barrier_before_simple_buffer_op(sctx, flags, dst, NULL);
if (method != SI_CP_DMA_CLEAR_METHOD &&
si_compute_clear_copy_buffer(sctx, dst, offset, NULL, 0, size, clear_value,
clear_value_size, flags, 0,
method == SI_AUTO_SELECT_CLEAR_METHOD))
method == SI_AUTO_SELECT_CLEAR_METHOD)) {
si_barrier_after_simple_buffer_op(sctx, flags, dst, NULL);
return;
}
uint64_t aligned_size = size & ~3ull;
if (aligned_size) {
@@ -421,6 +423,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, offset, aligned_size, *clear_value, flags);
}
si_barrier_after_simple_buffer_op(sctx, flags, dst, NULL);
offset += aligned_size;
size -= aligned_size;
@@ -455,11 +459,16 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
if (!size)
return;
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
if (si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, flags,
0, true))
0, true)) {
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
return;
}
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, flags);
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
}
void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
-6
View File
@@ -156,8 +156,6 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
}
si_barrier_before_simple_buffer_op(sctx, user_flags, dst, NULL);
/* Mark the buffer range of destination as valid (initialized),
* so that transfer_map knows it should wait for the GPU when mapping
* that range. */
@@ -187,7 +185,6 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
va += byte_count;
}
si_barrier_after_simple_buffer_op(sctx, user_flags, dst, NULL);
sctx->num_cp_dma_calls++;
}
@@ -245,8 +242,6 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
}
si_barrier_before_simple_buffer_op(sctx, user_flags, dst, src);
/* Mark the buffer range of destination as valid (initialized),
* so that transfer_map knows it should wait for the GPU when mapping
* that range.
@@ -344,7 +339,6 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
if (realign_size)
si_cp_dma_realign_engine(sctx, realign_size, user_flags, &is_first);
si_barrier_after_simple_buffer_op(sctx, user_flags, dst, src);
sctx->num_cp_dma_calls++;
}
@@ -57,8 +57,10 @@ void si_init_cp_reg_shadowing(struct si_context *sctx)
if (sctx->shadowing.registers) {
/* We need to clear the shadowed reg buffer. */
unsigned flags = SI_OP_SYNC_AFTER;
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, &sctx->shadowing.registers->b.b,
0, sctx->shadowing.registers->bo_size, 0, SI_OP_SYNC_AFTER);
0, sctx->shadowing.registers->bo_size, 0, flags);
si_barrier_after_simple_buffer_op(sctx, flags, &sctx->shadowing.registers->b.b, NULL);
/* Create the shadowing preamble. (allocate enough dwords because the preamble is large) */
struct si_pm4_state *shadowing_preamble = si_pm4_create_sized(sctx->screen, 256, false);
+3 -1
View File
@@ -985,8 +985,10 @@ static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shad
* a compute shader, and we can't use shaders in the code that is responsible for making
* them available.
*/
unsigned flags = SI_OP_SYNC_AFTER;
si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
binary_size, SI_OP_SYNC_AFTER);
binary_size, flags);
si_barrier_after_simple_buffer_op(upload_ctx, flags, &shader->bo->b.b, staging);
upload_ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_L2;
#if 0 /* debug: validate whether the copy was successful */
@@ -227,8 +227,11 @@ void si_test_dma_perf(struct si_screen *sscreen)
success = false;
continue;
}
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
SI_OP_SYNC_BEFORE_AFTER);
unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, flags);
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
} else {
/* CP DMA clears must be aligned to 4 bytes. */
if (dst_offset % 4 || size % 4 ||
@@ -237,17 +240,25 @@ void si_test_dma_perf(struct si_screen *sscreen)
success = false;
continue;
}
assert(clear_value_size == 4);
unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, dst_offset, size,
clear_value[0], SI_OP_SYNC_BEFORE_AFTER);
clear_value[0], flags);
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
}
} else {
/* Compute */
unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
success &=
si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset,
size, clear_value, clear_value_size,
SI_OP_SYNC_BEFORE_AFTER, dwords_per_thread,
false);
flags, dwords_per_thread, false);
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
}
sctx->flags |= SI_CONTEXT_INV_L2;
@@ -475,9 +486,12 @@ void si_test_clear_buffer(struct si_screen *sscreen)
printf("%s, ", COLOR_RESET);
fflush(stdout);
unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
si_barrier_before_simple_buffer_op(sctx, flags, dst, NULL);
bool done = si_compute_clear_copy_buffer(sctx, dst, dst_offset, NULL, 0, op_size,
(uint32_t*)clear_value, clear_value_size,
SI_OP_SYNC_BEFORE_AFTER, dwords_per_thread, false);
flags, dwords_per_thread, false);
si_barrier_after_simple_buffer_op(sctx, flags, dst, NULL);
if (done) {
pipe_buffer_read(ctx, dst, 0, buf_size, read_dst_buffer);
@@ -580,9 +594,11 @@ void si_test_copy_buffer(struct si_screen *sscreen)
}
fflush(stdout);
unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
bool done = si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset, op_size,
NULL, 0, SI_OP_SYNC_BEFORE_AFTER,
dwords_per_thread, false);
NULL, 0, flags, dwords_per_thread, false);
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
if (done) {
pipe_buffer_read(ctx, dst, 0, buf_size, read_dst_buffer);