radeonsi: simplify the complex clear/copy_buffer shader
Remove the logic that we don't need. In a future commit, it will be extended to optimize aspects of buffer clears and copies that need to be optimized. Changes: - remove the logic that generated multiple loads/stores per thread, only 1 load and store can occur in the shader now, allowing clearing/ copying max 4 dwords per thread - put the src buffer in SSBO slot 0, and the dst buffer in SSBO slot 1 Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29053>
This commit is contained in:
@@ -323,61 +323,42 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
|
||||
assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
|
||||
assert(!src || src_offset + size <= src->width0);
|
||||
|
||||
/* The memory accesses are coalesced, meaning that the 1st instruction writes
|
||||
* the 1st contiguous block of data for the whole wave, the 2nd instruction
|
||||
* writes the 2nd contiguous block of data, etc.
|
||||
*/
|
||||
unsigned dwords_per_thread =
|
||||
src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
|
||||
unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
|
||||
unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
|
||||
/* The shader declares the block size like this: */
|
||||
unsigned block_size = si_determine_wave_size(sctx->screen, NULL);
|
||||
unsigned dwords_per_wave = dwords_per_thread * block_size;
|
||||
|
||||
unsigned num_dwords = size / 4;
|
||||
unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
|
||||
bool is_copy = src != NULL;
|
||||
unsigned dwords_per_thread = 4;
|
||||
unsigned num_threads = DIV_ROUND_UP(size, dwords_per_thread * 4);
|
||||
|
||||
struct pipe_grid_info info = {};
|
||||
info.block[0] = MIN2(block_size, num_instructions);
|
||||
info.block[0] = 64;
|
||||
info.block[1] = 1;
|
||||
info.block[2] = 1;
|
||||
info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
|
||||
info.grid[0] = DIV_ROUND_UP(num_threads, 64);
|
||||
info.grid[1] = 1;
|
||||
info.grid[2] = 1;
|
||||
info.last_block[0] = num_threads % 64;
|
||||
|
||||
struct pipe_shader_buffer sb[2] = {};
|
||||
sb[0].buffer = dst;
|
||||
sb[0].buffer_offset = dst_offset;
|
||||
sb[0].buffer_size = size;
|
||||
sb[is_copy].buffer = dst;
|
||||
sb[is_copy].buffer_offset = dst_offset;
|
||||
sb[is_copy].buffer_size = size;
|
||||
|
||||
if (src) {
|
||||
sb[1].buffer = src;
|
||||
sb[1].buffer_offset = src_offset;
|
||||
sb[1].buffer_size = size;
|
||||
|
||||
if (!sctx->cs_copy_buffer) {
|
||||
sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD,
|
||||
true);
|
||||
}
|
||||
|
||||
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
|
||||
2, sb, 0x1);
|
||||
if (is_copy) {
|
||||
sb[0].buffer = src;
|
||||
sb[0].buffer_offset = src_offset;
|
||||
sb[0].buffer_size = size;
|
||||
} else {
|
||||
assert(clear_value_size >= 4 && clear_value_size <= 16 &&
|
||||
util_is_power_of_two_or_zero(clear_value_size));
|
||||
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
|
||||
|
||||
if (!sctx->cs_clear_buffer) {
|
||||
sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD,
|
||||
false);
|
||||
}
|
||||
|
||||
si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
|
||||
1, sb, 0x1);
|
||||
}
|
||||
|
||||
void **shader = is_copy ? &sctx->cs_copy_buffer : &sctx->cs_clear_buffer;
|
||||
if (!*shader)
|
||||
*shader = si_create_dma_compute_shader(sctx, dwords_per_thread, !is_copy);
|
||||
|
||||
si_launch_grid_internal_ssbos(sctx, &info, *shader, flags, coher, is_copy ? 2 : 1, sb,
|
||||
is_copy ? 0x2 : 0x1);
|
||||
}
|
||||
|
||||
void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
|
||||
|
||||
@@ -46,10 +46,6 @@ struct ac_llvm_compiler;
|
||||
/* Alignment for optimal CP DMA performance. */
|
||||
#define SI_CPDMA_ALIGNMENT 32
|
||||
|
||||
/* Tunables for compute-based clear_buffer and copy_buffer: */
|
||||
#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
|
||||
#define SI_COMPUTE_COPY_DW_PER_THREAD 4
|
||||
|
||||
/* Pipeline & streamout query controls. */
|
||||
#define SI_CONTEXT_START_PIPELINE_STATS (1 << 0)
|
||||
#define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1)
|
||||
@@ -1727,7 +1723,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
|
||||
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
|
||||
unsigned num_layers);
|
||||
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
|
||||
bool is_copy);
|
||||
bool is_clear);
|
||||
void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
|
||||
void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
|
||||
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
|
||||
|
||||
@@ -670,73 +670,31 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
|
||||
|
||||
/* Create a compute shader implementing clear_buffer or copy_buffer. */
|
||||
void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
|
||||
bool is_copy)
|
||||
bool is_clear)
|
||||
{
|
||||
assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
|
||||
assert(util_is_power_of_two_nonzero(num_dwords_per_thread) && num_dwords_per_thread <= 4);
|
||||
|
||||
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
|
||||
"create_dma_compute");
|
||||
unsigned wg_size = 64;
|
||||
|
||||
b.shader->info.workgroup_size[0] = wg_size;
|
||||
b.shader->info.workgroup_size[0] = 64;
|
||||
b.shader->info.workgroup_size[1] = 1;
|
||||
b.shader->info.workgroup_size[2] = 1;
|
||||
b.shader->info.num_ssbos = 1;
|
||||
b.shader->info.num_ssbos = is_clear ? 1 : 2;
|
||||
b.shader->info.cs.user_data_components_amd = is_clear ? num_dwords_per_thread : 0;
|
||||
|
||||
unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
|
||||
unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
|
||||
nir_def *thread_id = get_global_ids(&b, 1);
|
||||
/* Convert the global thread ID into bytes. */
|
||||
nir_def *offset = nir_imul_imm(&b, thread_id, 4 * num_dwords_per_thread);
|
||||
nir_def *value;
|
||||
|
||||
for (unsigned i = 0; i < num_mem_ops; i++) {
|
||||
if (i * 4 < num_dwords_per_thread)
|
||||
inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
|
||||
}
|
||||
|
||||
/* If there are multiple stores,
|
||||
* the first store writes into 0 * wavesize + tid,
|
||||
* the 2nd store writes into 1 * wavesize + tid,
|
||||
* the 3rd store writes into 2 * wavesize + tid, etc.
|
||||
*/
|
||||
nir_def *store_address =
|
||||
nir_iadd(&b, nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b), 0),
|
||||
wg_size * num_mem_ops),
|
||||
nir_channel(&b, nir_load_local_invocation_id(&b), 0));
|
||||
|
||||
/* Convert from a "store size unit" into bytes. */
|
||||
store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]);
|
||||
|
||||
nir_def *load_address = store_address, *value = NULL, *values[num_mem_ops];
|
||||
|
||||
if (is_copy) {
|
||||
b.shader->info.num_ssbos++;
|
||||
if (is_clear) {
|
||||
value = nir_trim_vector(&b, nir_load_user_data_amd(&b), num_dwords_per_thread);
|
||||
} else {
|
||||
b.shader->info.cs.user_data_components_amd = inst_dwords[0];
|
||||
value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]);
|
||||
value = nir_load_ssbo(&b, num_dwords_per_thread, 32, nir_imm_int(&b, 0), offset,
|
||||
.access = ACCESS_RESTRICT);
|
||||
}
|
||||
|
||||
/* Distance between a load and a store for latency hiding. */
|
||||
unsigned load_store_distance = is_copy ? 8 : 0;
|
||||
|
||||
for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
|
||||
int d = i - load_store_distance;
|
||||
|
||||
if (is_copy && i < num_mem_ops) {
|
||||
if (i) {
|
||||
load_address = nir_iadd(&b, load_address,
|
||||
nir_imm_int(&b, 4 * inst_dwords[i] * wg_size));
|
||||
}
|
||||
values[i] = nir_load_ssbo(&b, inst_dwords[i], 32, nir_imm_int(&b, 1), load_address,
|
||||
.access = ACCESS_RESTRICT);
|
||||
}
|
||||
|
||||
if (d >= 0) {
|
||||
if (d) {
|
||||
store_address = nir_iadd(&b, store_address,
|
||||
nir_imm_int(&b, 4 * inst_dwords[d] * wg_size));
|
||||
}
|
||||
nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
|
||||
.access = ACCESS_RESTRICT);
|
||||
}
|
||||
}
|
||||
nir_store_ssbo(&b, value, nir_imm_int(&b, !is_clear), offset, .access = ACCESS_RESTRICT);
|
||||
|
||||
return create_shader_state(sctx, b.shader);
|
||||
}
|
||||
|
||||
@@ -192,12 +192,12 @@ void si_test_dma_perf(struct si_screen *sscreen)
|
||||
info.grid[2] = 1;
|
||||
|
||||
struct pipe_shader_buffer sb[2] = {};
|
||||
sb[0].buffer = dst;
|
||||
sb[0].buffer_size = size;
|
||||
sb[is_copy].buffer = dst;
|
||||
sb[is_copy].buffer_size = size;
|
||||
|
||||
if (is_copy) {
|
||||
sb[1].buffer = src;
|
||||
sb[1].buffer_size = size;
|
||||
sb[0].buffer = src;
|
||||
sb[0].buffer_size = size;
|
||||
} else {
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
sctx->cs_user_data[i] = clear_value;
|
||||
|
||||
Reference in New Issue
Block a user