radeonsi: simplify the complex clear/copy_buffer shader

Remove the logic that we don't need. In a future commit, it will be
extended to optimize aspects of buffer clears and copies that need to be
optimized.

Changes:
- remove the logic that generated multiple loads/stores per thread,
  only 1 load and store can occur in the shader now, allowing clearing/
  copying max 4 dwords per thread
- put the src buffer in SSBO slot 0, and the dst buffer in SSBO slot 1

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29053>
This commit is contained in:
Marek Olšák
2024-04-27 01:58:36 -04:00
committed by Marge Bot
parent 92497d1c8f
commit 83d8b3bc1a
4 changed files with 39 additions and 104 deletions
+20 -39
View File
@@ -323,61 +323,42 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
assert(!src || src_offset + size <= src->width0);
/* The memory accesses are coalesced, meaning that the 1st instruction writes
* the 1st contiguous block of data for the whole wave, the 2nd instruction
* writes the 2nd contiguous block of data, etc.
*/
unsigned dwords_per_thread =
src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
/* The shader declares the block size like this: */
unsigned block_size = si_determine_wave_size(sctx->screen, NULL);
unsigned dwords_per_wave = dwords_per_thread * block_size;
unsigned num_dwords = size / 4;
unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
bool is_copy = src != NULL;
unsigned dwords_per_thread = 4;
unsigned num_threads = DIV_ROUND_UP(size, dwords_per_thread * 4);
struct pipe_grid_info info = {};
info.block[0] = MIN2(block_size, num_instructions);
info.block[0] = 64;
info.block[1] = 1;
info.block[2] = 1;
info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
info.grid[0] = DIV_ROUND_UP(num_threads, 64);
info.grid[1] = 1;
info.grid[2] = 1;
info.last_block[0] = num_threads % 64;
struct pipe_shader_buffer sb[2] = {};
sb[0].buffer = dst;
sb[0].buffer_offset = dst_offset;
sb[0].buffer_size = size;
sb[is_copy].buffer = dst;
sb[is_copy].buffer_offset = dst_offset;
sb[is_copy].buffer_size = size;
if (src) {
sb[1].buffer = src;
sb[1].buffer_offset = src_offset;
sb[1].buffer_size = size;
if (!sctx->cs_copy_buffer) {
sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD,
true);
}
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
2, sb, 0x1);
if (is_copy) {
sb[0].buffer = src;
sb[0].buffer_offset = src_offset;
sb[0].buffer_size = size;
} else {
assert(clear_value_size >= 4 && clear_value_size <= 16 &&
util_is_power_of_two_or_zero(clear_value_size));
for (unsigned i = 0; i < 4; i++)
sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
if (!sctx->cs_clear_buffer) {
sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD,
false);
}
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
1, sb, 0x1);
}
void **shader = is_copy ? &sctx->cs_copy_buffer : &sctx->cs_clear_buffer;
if (!*shader)
*shader = si_create_dma_compute_shader(sctx, dwords_per_thread, !is_copy);
si_launch_grid_internal_ssbos(sctx, &info, *shader, flags, coher, is_copy ? 2 : 1, sb,
is_copy ? 0x2 : 0x1);
}
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+1 -5
View File
@@ -46,10 +46,6 @@ struct ac_llvm_compiler;
/* Alignment for optimal CP DMA performance. */
#define SI_CPDMA_ALIGNMENT 32
/* Tunables for compute-based clear_buffer and copy_buffer: */
#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
#define SI_COMPUTE_COPY_DW_PER_THREAD 4
/* Pipeline & streamout query controls. */
#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0)
#define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1)
@@ -1727,7 +1723,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
unsigned num_layers);
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
bool is_copy);
bool is_clear);
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
+14 -56
View File
@@ -670,73 +670,31 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
/* Create a compute shader implementing clear_buffer or copy_buffer. */
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
bool is_copy)
bool is_clear)
{
assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
assert(util_is_power_of_two_nonzero(num_dwords_per_thread) && num_dwords_per_thread <= 4);
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
"create_dma_compute");
unsigned wg_size = 64;
b.shader->info.workgroup_size[0] = wg_size;
b.shader->info.workgroup_size[0] = 64;
b.shader->info.workgroup_size[1] = 1;
b.shader->info.workgroup_size[2] = 1;
b.shader->info.num_ssbos = 1;
b.shader->info.num_ssbos = is_clear ? 1 : 2;
b.shader->info.cs.user_data_components_amd = is_clear ? num_dwords_per_thread : 0;
unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
nir_def *thread_id = get_global_ids(&b, 1);
/* Convert the global thread ID into bytes. */
nir_def *offset = nir_imul_imm(&b, thread_id, 4 * num_dwords_per_thread);
nir_def *value;
for (unsigned i = 0; i < num_mem_ops; i++) {
if (i * 4 < num_dwords_per_thread)
inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
}
/* If there are multiple stores,
* the first store writes into 0 * wavesize + tid,
* the 2nd store writes into 1 * wavesize + tid,
* the 3rd store writes into 2 * wavesize + tid, etc.
*/
nir_def *store_address =
nir_iadd(&b, nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b), 0),
wg_size * num_mem_ops),
nir_channel(&b, nir_load_local_invocation_id(&b), 0));
/* Convert from a "store size unit" into bytes. */
store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]);
nir_def *load_address = store_address, *value = NULL, *values[num_mem_ops];
if (is_copy) {
b.shader->info.num_ssbos++;
if (is_clear) {
value = nir_trim_vector(&b, nir_load_user_data_amd(&b), num_dwords_per_thread);
} else {
b.shader->info.cs.user_data_components_amd = inst_dwords[0];
value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]);
value = nir_load_ssbo(&b, num_dwords_per_thread, 32, nir_imm_int(&b, 0), offset,
.access = ACCESS_RESTRICT);
}
/* Distance between a load and a store for latency hiding. */
unsigned load_store_distance = is_copy ? 8 : 0;
for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
int d = i - load_store_distance;
if (is_copy && i < num_mem_ops) {
if (i) {
load_address = nir_iadd(&b, load_address,
nir_imm_int(&b, 4 * inst_dwords[i] * wg_size));
}
values[i] = nir_load_ssbo(&b, inst_dwords[i], 32, nir_imm_int(&b, 1), load_address,
.access = ACCESS_RESTRICT);
}
if (d >= 0) {
if (d) {
store_address = nir_iadd(&b, store_address,
nir_imm_int(&b, 4 * inst_dwords[d] * wg_size));
}
nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
.access = ACCESS_RESTRICT);
}
}
nir_store_ssbo(&b, value, nir_imm_int(&b, !is_clear), offset, .access = ACCESS_RESTRICT);
return create_shader_state(sctx, b.shader);
}
@@ -192,12 +192,12 @@ void si_test_dma_perf(struct si_screen *sscreen)
info.grid[2] = 1;
struct pipe_shader_buffer sb[2] = {};
sb[0].buffer = dst;
sb[0].buffer_size = size;
sb[is_copy].buffer = dst;
sb[is_copy].buffer_size = size;
if (is_copy) {
sb[1].buffer = src;
sb[1].buffer_size = size;
sb[0].buffer = src;
sb[0].buffer_size = size;
} else {
for (unsigned i = 0; i < 4; i++)
sctx->cs_user_data[i] = clear_value;