From 83d8b3bc1a05ce482bf4b1e82af009205aaba557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 27 Apr 2024 01:58:36 -0400 Subject: [PATCH] radeonsi: simplify the complex clear/copy_buffer shader Remove the logic that we don't need. In a future commit, it will be extended to optimize aspects of buffer clears and copies that need to be optimized. Changes: - remove the logic that generated multiple loads/stores per thread, only 1 load and store can occur in the shader now, allowing clearing/ copying max 4 dwords per thread - put the src buffer in SSBO slot 0, and the dst buffer in SSBO slot 1 Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_compute_blit.c | 59 ++++++---------- src/gallium/drivers/radeonsi/si_pipe.h | 6 +- .../drivers/radeonsi/si_shaderlib_nir.c | 70 ++++--------------- .../drivers/radeonsi/si_test_dma_perf.c | 8 +-- 4 files changed, 39 insertions(+), 104 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index a007035604a..e346fea08f8 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -323,61 +323,42 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); assert(!src || src_offset + size <= src->width0); - /* The memory accesses are coalesced, meaning that the 1st instruction writes - * the 1st contiguous block of data for the whole wave, the 2nd instruction - * writes the 2nd contiguous block of data, etc. - */ - unsigned dwords_per_thread = - src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD; - unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4); - unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread; - /* The shader declares the block size like this: */ - unsigned block_size = si_determine_wave_size(sctx->screen, NULL); - unsigned dwords_per_wave = dwords_per_thread * block_size; - - unsigned num_dwords = size / 4; - unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction); + bool is_copy = src != NULL; + unsigned dwords_per_thread = 4; + unsigned num_threads = DIV_ROUND_UP(size, dwords_per_thread * 4); struct pipe_grid_info info = {}; - info.block[0] = MIN2(block_size, num_instructions); + info.block[0] = 64; info.block[1] = 1; info.block[2] = 1; - info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave); + info.grid[0] = DIV_ROUND_UP(num_threads, 64); info.grid[1] = 1; info.grid[2] = 1; + info.last_block[0] = num_threads % 64; struct pipe_shader_buffer sb[2] = {}; - sb[0].buffer = dst; - sb[0].buffer_offset = dst_offset; - sb[0].buffer_size = size; + sb[is_copy].buffer = dst; + sb[is_copy].buffer_offset = dst_offset; + sb[is_copy].buffer_size = size; - if (src) { - sb[1].buffer = src; - sb[1].buffer_offset = src_offset; - sb[1].buffer_size = size; - - if (!sctx->cs_copy_buffer) { - sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD, - true); - } - - si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher, - 2, sb, 0x1); + if (is_copy) { + sb[0].buffer = src; + sb[0].buffer_offset = src_offset; + sb[0].buffer_size = size; } else { assert(clear_value_size >= 4 && clear_value_size <= 16 && util_is_power_of_two_or_zero(clear_value_size)); for (unsigned i = 0; i < 4; i++) sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)]; - - if (!sctx->cs_clear_buffer) { - sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, - false); - } - - si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher, - 1, sb, 0x1); } + + void **shader = is_copy ? &sctx->cs_copy_buffer : &sctx->cs_clear_buffer; + if (!*shader) + *shader = si_create_dma_compute_shader(sctx, dwords_per_thread, !is_copy); + + si_launch_grid_internal_ssbos(sctx, &info, *shader, flags, coher, is_copy ? 2 : 1, sb, + is_copy ? 0x2 : 0x1); } void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index a12bcbc199f..c1c632e2081 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -46,10 +46,6 @@ struct ac_llvm_compiler; /* Alignment for optimal CP DMA performance. */ #define SI_CPDMA_ALIGNMENT 32 -/* Tunables for compute-based clear_buffer and copy_buffer: */ -#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4 -#define SI_COMPUTE_COPY_DW_PER_THREAD 4 - /* Pipeline & streamout query controls. */ #define SI_CONTEXT_START_PIPELINE_STATS (1 << 0) #define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1) @@ -1727,7 +1723,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers); void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, - bool is_copy); + bool is_clear); void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx); void *si_create_clear_buffer_rmw_cs(struct si_context *sctx); void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 44c739fb297..efc333c9706 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -670,73 +670,31 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx) /* Create a compute shader implementing clear_buffer or copy_buffer. */ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, - bool is_copy) + bool is_clear) { - assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); + assert(util_is_power_of_two_nonzero(num_dwords_per_thread) && num_dwords_per_thread <= 4); nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options, "create_dma_compute"); - unsigned wg_size = 64; - - b.shader->info.workgroup_size[0] = wg_size; + b.shader->info.workgroup_size[0] = 64; b.shader->info.workgroup_size[1] = 1; b.shader->info.workgroup_size[2] = 1; - b.shader->info.num_ssbos = 1; + b.shader->info.num_ssbos = is_clear ? 1 : 2; + b.shader->info.cs.user_data_components_amd = is_clear ? num_dwords_per_thread : 0; - unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4); - unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned)); + nir_def *thread_id = get_global_ids(&b, 1); + /* Convert the global thread ID into bytes. */ + nir_def *offset = nir_imul_imm(&b, thread_id, 4 * num_dwords_per_thread); + nir_def *value; - for (unsigned i = 0; i < num_mem_ops; i++) { - if (i * 4 < num_dwords_per_thread) - inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4); - } - - /* If there are multiple stores, - * the first store writes into 0 * wavesize + tid, - * the 2nd store writes into 1 * wavesize + tid, - * the 3rd store writes into 2 * wavesize + tid, etc. - */ - nir_def *store_address = - nir_iadd(&b, nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b), 0), - wg_size * num_mem_ops), - nir_channel(&b, nir_load_local_invocation_id(&b), 0)); - - /* Convert from a "store size unit" into bytes. */ - store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]); - - nir_def *load_address = store_address, *value = NULL, *values[num_mem_ops]; - - if (is_copy) { - b.shader->info.num_ssbos++; + if (is_clear) { + value = nir_trim_vector(&b, nir_load_user_data_amd(&b), num_dwords_per_thread); } else { - b.shader->info.cs.user_data_components_amd = inst_dwords[0]; - value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]); + value = nir_load_ssbo(&b, num_dwords_per_thread, 32, nir_imm_int(&b, 0), offset, + .access = ACCESS_RESTRICT); } - /* Distance between a load and a store for latency hiding. */ - unsigned load_store_distance = is_copy ? 8 : 0; - - for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) { - int d = i - load_store_distance; - - if (is_copy && i < num_mem_ops) { - if (i) { - load_address = nir_iadd(&b, load_address, - nir_imm_int(&b, 4 * inst_dwords[i] * wg_size)); - } - values[i] = nir_load_ssbo(&b, inst_dwords[i], 32, nir_imm_int(&b, 1), load_address, - .access = ACCESS_RESTRICT); - } - - if (d >= 0) { - if (d) { - store_address = nir_iadd(&b, store_address, - nir_imm_int(&b, 4 * inst_dwords[d] * wg_size)); - } - nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address, - .access = ACCESS_RESTRICT); - } - } + nir_store_ssbo(&b, value, nir_imm_int(&b, !is_clear), offset, .access = ACCESS_RESTRICT); return create_shader_state(sctx, b.shader); } diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 3d58b6704d3..853a649957a 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -192,12 +192,12 @@ void si_test_dma_perf(struct si_screen *sscreen) info.grid[2] = 1; struct pipe_shader_buffer sb[2] = {}; - sb[0].buffer = dst; - sb[0].buffer_size = size; + sb[is_copy].buffer = dst; + sb[is_copy].buffer_size = size; if (is_copy) { - sb[1].buffer = src; - sb[1].buffer_size = size; + sb[0].buffer = src; + sb[0].buffer_size = size; } else { for (unsigned i = 0; i < 4; i++) sctx->cs_user_data[i] = clear_value;