radeonsi: move barriers out of si_compute_clear_copy_buffer & si_cp_dma_*
Some places don't need si_barrier_before_simple_buffer_op. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31193>
This commit is contained in:
@@ -382,10 +382,8 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource
|
||||
struct pipe_grid_info grid = {};
|
||||
set_work_size(&grid, dispatch.workgroup_size, 1, 1, dispatch.num_threads, 1, 1);
|
||||
|
||||
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
|
||||
si_launch_grid_internal_ssbos(sctx, &grid, shader, flags, dispatch.num_ssbos, sb,
|
||||
is_copy ? 0x2 : 0x1);
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -408,11 +406,15 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
|
||||
if (util_lower_clearsize_to_dword(clear_value, (int*)&clear_value_size, &clamped))
|
||||
clear_value = &clamped;
|
||||
|
||||
si_barrier_before_simple_buffer_op(sctx, flags, dst, NULL);
|
||||
|
||||
if (method != SI_CP_DMA_CLEAR_METHOD &&
|
||||
si_compute_clear_copy_buffer(sctx, dst, offset, NULL, 0, size, clear_value,
|
||||
clear_value_size, flags, 0,
|
||||
method == SI_AUTO_SELECT_CLEAR_METHOD))
|
||||
method == SI_AUTO_SELECT_CLEAR_METHOD)) {
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, dst, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
uint64_t aligned_size = size & ~3ull;
|
||||
if (aligned_size) {
|
||||
@@ -421,6 +423,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
|
||||
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, offset, aligned_size, *clear_value, flags);
|
||||
}
|
||||
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, dst, NULL);
|
||||
|
||||
offset += aligned_size;
|
||||
size -= aligned_size;
|
||||
|
||||
@@ -455,11 +459,16 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
|
||||
if (!size)
|
||||
return;
|
||||
|
||||
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
|
||||
|
||||
if (si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, flags,
|
||||
0, true))
|
||||
0, true)) {
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
|
||||
return;
|
||||
}
|
||||
|
||||
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, flags);
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
|
||||
}
|
||||
|
||||
void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
|
||||
|
||||
@@ -156,8 +156,6 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
|
||||
}
|
||||
|
||||
si_barrier_before_simple_buffer_op(sctx, user_flags, dst, NULL);
|
||||
|
||||
/* Mark the buffer range of destination as valid (initialized),
|
||||
* so that transfer_map knows it should wait for the GPU when mapping
|
||||
* that range. */
|
||||
@@ -187,7 +185,6 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
|
||||
va += byte_count;
|
||||
}
|
||||
|
||||
si_barrier_after_simple_buffer_op(sctx, user_flags, dst, NULL);
|
||||
sctx->num_cp_dma_calls++;
|
||||
}
|
||||
|
||||
@@ -245,8 +242,6 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
|
||||
}
|
||||
|
||||
si_barrier_before_simple_buffer_op(sctx, user_flags, dst, src);
|
||||
|
||||
/* Mark the buffer range of destination as valid (initialized),
|
||||
* so that transfer_map knows it should wait for the GPU when mapping
|
||||
* that range.
|
||||
@@ -344,7 +339,6 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
|
||||
if (realign_size)
|
||||
si_cp_dma_realign_engine(sctx, realign_size, user_flags, &is_first);
|
||||
|
||||
si_barrier_after_simple_buffer_op(sctx, user_flags, dst, src);
|
||||
sctx->num_cp_dma_calls++;
|
||||
}
|
||||
|
||||
|
||||
@@ -57,8 +57,10 @@ void si_init_cp_reg_shadowing(struct si_context *sctx)
|
||||
|
||||
if (sctx->shadowing.registers) {
|
||||
/* We need to clear the shadowed reg buffer. */
|
||||
unsigned flags = SI_OP_SYNC_AFTER;
|
||||
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, &sctx->shadowing.registers->b.b,
|
||||
0, sctx->shadowing.registers->bo_size, 0, SI_OP_SYNC_AFTER);
|
||||
0, sctx->shadowing.registers->bo_size, 0, flags);
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, &sctx->shadowing.registers->b.b, NULL);
|
||||
|
||||
/* Create the shadowing preamble. (allocate enough dwords because the preamble is large) */
|
||||
struct si_pm4_state *shadowing_preamble = si_pm4_create_sized(sctx->screen, 256, false);
|
||||
|
||||
@@ -985,8 +985,10 @@ static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shad
|
||||
* a compute shader, and we can't use shaders in the code that is responsible for making
|
||||
* them available.
|
||||
*/
|
||||
unsigned flags = SI_OP_SYNC_AFTER;
|
||||
si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
|
||||
binary_size, SI_OP_SYNC_AFTER);
|
||||
binary_size, flags);
|
||||
si_barrier_after_simple_buffer_op(upload_ctx, flags, &shader->bo->b.b, staging);
|
||||
upload_ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_L2;
|
||||
|
||||
#if 0 /* debug: validate whether the copy was successful */
|
||||
|
||||
@@ -227,8 +227,11 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
||||
success = false;
|
||||
continue;
|
||||
}
|
||||
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
|
||||
SI_OP_SYNC_BEFORE_AFTER);
|
||||
|
||||
unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
|
||||
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
|
||||
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, flags);
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
|
||||
} else {
|
||||
/* CP DMA clears must be aligned to 4 bytes. */
|
||||
if (dst_offset % 4 || size % 4 ||
|
||||
@@ -237,17 +240,25 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
||||
success = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
assert(clear_value_size == 4);
|
||||
unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
|
||||
|
||||
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
|
||||
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, dst_offset, size,
|
||||
clear_value[0], SI_OP_SYNC_BEFORE_AFTER);
|
||||
clear_value[0], flags);
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
|
||||
}
|
||||
} else {
|
||||
/* Compute */
|
||||
unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
|
||||
|
||||
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
|
||||
success &=
|
||||
si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset,
|
||||
size, clear_value, clear_value_size,
|
||||
SI_OP_SYNC_BEFORE_AFTER, dwords_per_thread,
|
||||
false);
|
||||
flags, dwords_per_thread, false);
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
|
||||
}
|
||||
|
||||
sctx->flags |= SI_CONTEXT_INV_L2;
|
||||
@@ -475,9 +486,12 @@ void si_test_clear_buffer(struct si_screen *sscreen)
|
||||
printf("%s, ", COLOR_RESET);
|
||||
fflush(stdout);
|
||||
|
||||
unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
|
||||
si_barrier_before_simple_buffer_op(sctx, flags, dst, NULL);
|
||||
bool done = si_compute_clear_copy_buffer(sctx, dst, dst_offset, NULL, 0, op_size,
|
||||
(uint32_t*)clear_value, clear_value_size,
|
||||
SI_OP_SYNC_BEFORE_AFTER, dwords_per_thread, false);
|
||||
flags, dwords_per_thread, false);
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, dst, NULL);
|
||||
|
||||
if (done) {
|
||||
pipe_buffer_read(ctx, dst, 0, buf_size, read_dst_buffer);
|
||||
@@ -580,9 +594,11 @@ void si_test_copy_buffer(struct si_screen *sscreen)
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
unsigned flags = SI_OP_SYNC_BEFORE_AFTER;
|
||||
si_barrier_before_simple_buffer_op(sctx, flags, dst, src);
|
||||
bool done = si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset, op_size,
|
||||
NULL, 0, SI_OP_SYNC_BEFORE_AFTER,
|
||||
dwords_per_thread, false);
|
||||
NULL, 0, flags, dwords_per_thread, false);
|
||||
si_barrier_after_simple_buffer_op(sctx, flags, dst, src);
|
||||
|
||||
if (done) {
|
||||
pipe_buffer_read(ctx, dst, 0, buf_size, read_dst_buffer);
|
||||
|
||||
Reference in New Issue
Block a user