radeonsi/gfx11: optimize attribute stores
Reviewed-by: Mihai Preda <mhpreda@gmail.com> Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16509>
This commit is contained in:
@@ -1660,7 +1660,7 @@ void gfx10_ngg_build_end(struct si_shader_context *ctx)
|
||||
i++;
|
||||
}
|
||||
|
||||
si_llvm_build_vs_exports(ctx, outputs, i);
|
||||
si_llvm_build_vs_exports(ctx, NULL, outputs, i);
|
||||
}
|
||||
ac_build_endif(&ctx->ac, 6002);
|
||||
}
|
||||
@@ -2211,7 +2211,8 @@ void gfx10_ngg_gs_build_end(struct si_shader_context *ctx)
|
||||
ac_build_endif(&ctx->ac, 5140);
|
||||
|
||||
/* Export position and parameter data */
|
||||
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, vertlive_scan.result_reduce, "");
|
||||
LLVMValueRef num_export_threads = vertlive_scan.result_reduce;
|
||||
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_export_threads, "");
|
||||
ac_build_ifcc(&ctx->ac, tmp, 5145);
|
||||
{
|
||||
struct si_shader_output_values outputs[PIPE_MAX_SHADER_OUTPUTS];
|
||||
@@ -2233,7 +2234,7 @@ void gfx10_ngg_gs_build_end(struct si_shader_context *ctx)
|
||||
}
|
||||
}
|
||||
|
||||
si_llvm_build_vs_exports(ctx, outputs, info->num_outputs);
|
||||
si_llvm_build_vs_exports(ctx, num_export_threads, outputs, info->num_outputs);
|
||||
}
|
||||
ac_build_endif(&ctx->ac, 5145);
|
||||
}
|
||||
|
||||
@@ -272,7 +272,7 @@ void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef
|
||||
struct si_shader_output_values *shader_out);
|
||||
void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
|
||||
unsigned noutput, unsigned stream);
|
||||
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
|
||||
void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_export_threads,
|
||||
struct si_shader_output_values *outputs, unsigned noutput);
|
||||
void si_llvm_vs_build_end(struct si_shader_context *ctx);
|
||||
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
|
||||
|
||||
@@ -606,7 +606,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
|
||||
}
|
||||
|
||||
if (stream == 0)
|
||||
si_llvm_build_vs_exports(&ctx, outputs, gsinfo->num_outputs);
|
||||
si_llvm_build_vs_exports(&ctx, NULL, outputs, gsinfo->num_outputs);
|
||||
|
||||
LLVMBuildBr(builder, end_bb);
|
||||
}
|
||||
|
||||
@@ -504,10 +504,13 @@ static void si_vertex_color_clamping(struct si_shader_context *ctx,
|
||||
}
|
||||
}
|
||||
|
||||
/* Generate export instructions for hardware VS shader stage or NGG GS stage
|
||||
/**
|
||||
* Generate export instructions for hardware VS shader stage or NGG GS stage
|
||||
* (position and parameter data only).
|
||||
*
|
||||
* \param num_export_threads The number of threads that are active for exports. Only used by gfx11.
|
||||
*/
|
||||
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
|
||||
void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_export_threads,
|
||||
struct si_shader_output_values *outputs, unsigned noutput)
|
||||
{
|
||||
struct si_shader *shader = ctx->shader;
|
||||
@@ -722,6 +725,30 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx,
|
||||
}
|
||||
|
||||
if (ctx->screen->info.gfx_level >= GFX11) {
|
||||
/* Store primitive exports to alloca variables, so that we can read them outside this branch. */
|
||||
for (unsigned i = 0; i < shader->info.nr_param_exports; i++) {
|
||||
for (unsigned chan = 0; chan < 4; chan++) {
|
||||
param_exports[i].out[chan] =
|
||||
ac_build_alloca_init(&ctx->ac, param_exports[i].out[chan], "");
|
||||
}
|
||||
}
|
||||
ac_build_endif(&ctx->ac, 0);
|
||||
|
||||
if (!num_export_threads)
|
||||
num_export_threads = si_unpack_param(ctx, ctx->args.merged_wave_info, 0, 8);
|
||||
|
||||
/* We should always store full vec4s in groups of 8 lanes for the best performance even if
|
||||
* some of them are garbage or have unused components, so align the number of export threads
|
||||
* to 8.
|
||||
*/
|
||||
num_export_threads = LLVMBuildAdd(ctx->ac.builder, num_export_threads,
|
||||
LLVMConstInt(ctx->ac.i32, 7, 0), "");
|
||||
num_export_threads = LLVMBuildAnd(ctx->ac.builder, num_export_threads,
|
||||
LLVMConstInt(ctx->ac.i32, ~7, 0), "");
|
||||
ac_build_ifcc(&ctx->ac,
|
||||
LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
|
||||
ac_get_thread_id(&ctx->ac), num_export_threads, ""), 0);
|
||||
|
||||
/* Get the attribute ring address and descriptor. */
|
||||
LLVMValueRef attr_address;
|
||||
if (ctx->stage == MESA_SHADER_VERTEX && shader->selector->info.base.vs.blit_sgprs_amd) {
|
||||
@@ -765,6 +792,11 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx,
|
||||
|
||||
/* Write attributes to the attribute ring buffer. */
|
||||
for (unsigned i = 0; i < shader->info.nr_param_exports; i++) {
|
||||
for (unsigned chan = 0; chan < 4; chan++) {
|
||||
param_exports[i].out[chan] =
|
||||
LLVMBuildLoad(ctx->ac.builder, param_exports[i].out[chan], "");
|
||||
}
|
||||
|
||||
LLVMValueRef vdata = ac_build_gather_values_extended(&ctx->ac, param_exports[i].out,
|
||||
4, 1, false);
|
||||
|
||||
@@ -812,7 +844,7 @@ void si_llvm_vs_build_end(struct si_shader_context *ctx)
|
||||
i++;
|
||||
}
|
||||
|
||||
si_llvm_build_vs_exports(ctx, outputs, i);
|
||||
si_llvm_build_vs_exports(ctx, NULL, outputs, i);
|
||||
FREE(outputs);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user