diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index d629ee77920..5025e005e3c 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -382,10 +382,8 @@ bool si_compute_clear_copy_buffer(struct si_context *sctx, struct pipe_resource struct pipe_grid_info grid = {}; set_work_size(&grid, dispatch.workgroup_size, 1, 1, dispatch.num_threads, 1, 1); - si_barrier_before_simple_buffer_op(sctx, flags, dst, src); si_launch_grid_internal_ssbos(sctx, &grid, shader, flags, dispatch.num_ssbos, sb, is_copy ? 0x2 : 0x1); - si_barrier_after_simple_buffer_op(sctx, flags, dst, src); return true; } @@ -408,11 +406,15 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, if (util_lower_clearsize_to_dword(clear_value, (int*)&clear_value_size, &clamped)) clear_value = &clamped; + si_barrier_before_simple_buffer_op(sctx, flags, dst, NULL); + if (method != SI_CP_DMA_CLEAR_METHOD && si_compute_clear_copy_buffer(sctx, dst, offset, NULL, 0, size, clear_value, clear_value_size, flags, 0, - method == SI_AUTO_SELECT_CLEAR_METHOD)) + method == SI_AUTO_SELECT_CLEAR_METHOD)) { + si_barrier_after_simple_buffer_op(sctx, flags, dst, NULL); return; + } uint64_t aligned_size = size & ~3ull; if (aligned_size) { @@ -421,6 +423,8 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, offset, aligned_size, *clear_value, flags); } + si_barrier_after_simple_buffer_op(sctx, flags, dst, NULL); + offset += aligned_size; size -= aligned_size; @@ -455,11 +459,16 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p if (!size) return; + si_barrier_before_simple_buffer_op(sctx, flags, dst, src); + if (si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, flags, - 0, true)) + 0, true)) { + si_barrier_after_simple_buffer_op(sctx, flags, dst, src); return; + } si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, flags); + si_barrier_after_simple_buffer_op(sctx, flags, dst, src); } void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 1a5596475d7..56621bf5de0 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -156,8 +156,6 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } - si_barrier_before_simple_buffer_op(sctx, user_flags, dst, NULL); - /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ @@ -187,7 +185,6 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, va += byte_count; } - si_barrier_after_simple_buffer_op(sctx, user_flags, dst, NULL); sctx->num_cp_dma_calls++; } @@ -245,8 +242,6 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); } - si_barrier_before_simple_buffer_op(sctx, user_flags, dst, src); - /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. @@ -344,7 +339,6 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, if (realign_size) si_cp_dma_realign_engine(sctx, realign_size, user_flags, &is_first); - si_barrier_after_simple_buffer_op(sctx, user_flags, dst, src); sctx->num_cp_dma_calls++; } diff --git a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c index ddac2a06cdf..6065478812b 100644 --- a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c +++ b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c @@ -57,8 +57,10 @@ void si_init_cp_reg_shadowing(struct si_context *sctx) if (sctx->shadowing.registers) { /* We need to clear the shadowed reg buffer. */ + unsigned flags = SI_OP_SYNC_AFTER; si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, &sctx->shadowing.registers->b.b, - 0, sctx->shadowing.registers->bo_size, 0, SI_OP_SYNC_AFTER); + 0, sctx->shadowing.registers->bo_size, 0, flags); + si_barrier_after_simple_buffer_op(sctx, flags, &sctx->shadowing.registers->b.b, NULL); /* Create the shadowing preamble. (allocate enough dwords because the preamble is large) */ struct si_pm4_state *shadowing_preamble = si_pm4_create_sized(sctx->screen, 256, false); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 1cbf9c3a686..6829f9d4cef 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -985,8 +985,10 @@ static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shad * a compute shader, and we can't use shaders in the code that is responsible for making * them available. */ + unsigned flags = SI_OP_SYNC_AFTER; si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset, - binary_size, SI_OP_SYNC_AFTER); + binary_size, flags); + si_barrier_after_simple_buffer_op(upload_ctx, flags, &shader->bo->b.b, staging); upload_ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_L2; #if 0 /* debug: validate whether the copy was successful */ diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 3549a32e0ed..6f833d1419f 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -227,8 +227,11 @@ void si_test_dma_perf(struct si_screen *sscreen) success = false; continue; } - si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, - SI_OP_SYNC_BEFORE_AFTER); + + unsigned flags = SI_OP_SYNC_BEFORE_AFTER; + si_barrier_before_simple_buffer_op(sctx, flags, dst, src); + si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, flags); + si_barrier_after_simple_buffer_op(sctx, flags, dst, src); } else { /* CP DMA clears must be aligned to 4 bytes. */ if (dst_offset % 4 || size % 4 || @@ -237,17 +240,25 @@ void si_test_dma_perf(struct si_screen *sscreen) success = false; continue; } + assert(clear_value_size == 4); + unsigned flags = SI_OP_SYNC_BEFORE_AFTER; + + si_barrier_before_simple_buffer_op(sctx, flags, dst, src); si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, dst_offset, size, - clear_value[0], SI_OP_SYNC_BEFORE_AFTER); + clear_value[0], flags); + si_barrier_after_simple_buffer_op(sctx, flags, dst, src); } } else { /* Compute */ + unsigned flags = SI_OP_SYNC_BEFORE_AFTER; + + si_barrier_before_simple_buffer_op(sctx, flags, dst, src); success &= si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset, size, clear_value, clear_value_size, - SI_OP_SYNC_BEFORE_AFTER, dwords_per_thread, - false); + flags, dwords_per_thread, false); + si_barrier_after_simple_buffer_op(sctx, flags, dst, src); } sctx->flags |= SI_CONTEXT_INV_L2; @@ -475,9 +486,12 @@ void si_test_clear_buffer(struct si_screen *sscreen) printf("%s, ", COLOR_RESET); fflush(stdout); + unsigned flags = SI_OP_SYNC_BEFORE_AFTER; + si_barrier_before_simple_buffer_op(sctx, flags, dst, NULL); bool done = si_compute_clear_copy_buffer(sctx, dst, dst_offset, NULL, 0, op_size, (uint32_t*)clear_value, clear_value_size, - SI_OP_SYNC_BEFORE_AFTER, dwords_per_thread, false); + flags, dwords_per_thread, false); + si_barrier_after_simple_buffer_op(sctx, flags, dst, NULL); if (done) { pipe_buffer_read(ctx, dst, 0, buf_size, read_dst_buffer); @@ -580,9 +594,11 @@ void si_test_copy_buffer(struct si_screen *sscreen) } fflush(stdout); + unsigned flags = SI_OP_SYNC_BEFORE_AFTER; + si_barrier_before_simple_buffer_op(sctx, flags, dst, src); bool done = si_compute_clear_copy_buffer(sctx, dst, dst_offset, src, src_offset, op_size, - NULL, 0, SI_OP_SYNC_BEFORE_AFTER, - dwords_per_thread, false); + NULL, 0, flags, dwords_per_thread, false); + si_barrier_after_simple_buffer_op(sctx, flags, dst, src); if (done) { pipe_buffer_read(ctx, dst, 0, buf_size, read_dst_buffer);