From 1190808eca6c23d18ff1c2ba7abc99601b221bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 14 Nov 2020 01:12:29 -0500 Subject: [PATCH] radeonsi: if VS and TCS have the same number of threads, merge the conditonals Instead of: if (VS) { VS; } if (TCS) { TCS; } Do this if the number of threads is the same in VS and TCS: exec = enabled_threads; VS; TCS; Skipping declare_vb_descriptor_input_sgprs is needed to match the VS return values. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_shader.c | 18 +++++---- src/gallium/drivers/radeonsi/si_shader.h | 4 ++ .../drivers/radeonsi/si_shader_internal.h | 2 +- src/gallium/drivers/radeonsi/si_shader_llvm.c | 26 +++++++++---- .../drivers/radeonsi/si_shader_llvm_ps.c | 2 +- src/gallium/drivers/radeonsi/si_state_draw.c | 37 +++++++++++++------ .../drivers/radeonsi/si_state_shaders.c | 1 + 8 files changed, 62 insertions(+), 29 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 8b7e8f2a19a..e7d575308fd 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1111,6 +1111,7 @@ struct si_context { bool ls_vgpr_fix : 1; bool prim_discard_cs_instancing : 1; bool ngg : 1; + bool same_patch_vertices : 1; uint8_t ngg_culling; int last_index_size; int last_base_vertex; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index fbeed398a28..ec748c92ce9 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -464,7 +464,8 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); - declare_vb_descriptor_input_sgprs(ctx); + if (ctx->stage == MESA_SHADER_VERTEX) + declare_vb_descriptor_input_sgprs(ctx); /* VGPRs (first TCS, then VS) */ ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); @@ -1212,6 +1213,8 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n", key->mono.u.ff_tcs_inputs_to_copy); + fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono); + fprintf(f, " opt.same_patch_vertices = %u\n", key->opt.same_patch_vertices); break; case MESA_SHADER_TESS_EVAL: @@ -1733,7 +1736,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com } parts[num_parts++] = main_fn; - si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0); + si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0, false); if (ctx.shader->key.opt.vs_as_prim_discard_cs) si_build_prim_discard_compute_shader(&ctx); @@ -1743,7 +1746,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com parts[0] = ngg_cull_main_fn; parts[1] = ctx.main_fn; - si_build_wrapper_function(&ctx, parts, 2, 0, 0); + si_build_wrapper_function(&ctx, parts, 2, 0, 0, false); } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) { if (sscreen->info.chip_class >= GFX9) { struct si_shader_selector *ls = shader->key.part.tcs.ls; @@ -1792,7 +1795,8 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com ctx.stage = MESA_SHADER_TESS_CTRL; si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog, - vs_needs_prolog, vs_needs_prolog ? 2 : 1); + vs_needs_prolog, vs_needs_prolog ? 2 : 1, + shader->key.opt.same_patch_vertices); } else { LLVMValueRef parts[2]; union si_shader_part_key epilog_key; @@ -1804,7 +1808,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com si_llvm_build_tcs_epilog(&ctx, &epilog_key); parts[1] = ctx.main_fn; - si_build_wrapper_function(&ctx, parts, 2, 0, 0); + si_build_wrapper_function(&ctx, parts, 2, 0, 0, false); } } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) { if (ctx.screen->info.chip_class >= GFX9) { @@ -1866,7 +1870,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com parts[next_first_part = num_parts++] = gs_prolog; parts[num_parts++] = gs_main; - si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part); + si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part, false); } else { LLVMValueRef parts[2]; union si_shader_part_key prolog_key; @@ -1878,7 +1882,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com si_llvm_build_gs_prolog(&ctx, &prolog_key); parts[0] = ctx.main_fn; - si_build_wrapper_function(&ctx, parts, 2, 1, 0); + si_build_wrapper_function(&ctx, parts, 2, 1, 0, false); } } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) { si_llvm_build_monolithic_ps(&ctx, shader); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 1fe926338a5..c9aa439b740 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -673,6 +673,10 @@ struct si_shader_key { unsigned cs_cull_back : 1; unsigned cs_cull_z : 1; unsigned cs_halfz_clip_space : 1; + + /* VS and TCS have the same number of patch vertices. */ + unsigned same_patch_vertices:1; + unsigned inline_uniforms:1; uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS]; diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 6722e581415..38c5ab94dab 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -252,7 +252,7 @@ void si_llvm_declare_compute_memory(struct si_shader_context *ctx); bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir); void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, unsigned num_parts, unsigned main_part, - unsigned next_shader_first_part); + unsigned next_shader_first_part, bool same_thread_count); /* si_shader_llvm_gs.c */ LLVMValueRef si_is_es_thread(struct si_shader_context *ctx); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 3a69d1d9006..bb13d6d6fc7 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -456,7 +456,7 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir) */ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts, unsigned num_parts, unsigned main_part, - unsigned next_shader_first_part) + unsigned next_shader_first_part, bool same_thread_count) { LLVMBuilderRef builder = ctx->ac.builder; /* PS epilog has one arg per color component; gfx9 merged shader @@ -559,7 +559,7 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part si_llvm_create_func(ctx, "wrapper", returns, num_returns, si_get_max_workgroup_size(ctx->shader)); - if (si_is_merged_shader(ctx->shader)) + if (si_is_merged_shader(ctx->shader) && !same_thread_count) ac_init_exec_full_mask(&ctx->ac); /* Record the arguments of the function as if they were an output of @@ -618,11 +618,19 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part /* Merged shaders are executed conditionally depending * on the number of enabled threads passed in the input SGPRs. */ if (si_is_multi_part_shader(ctx->shader) && part == 0) { - LLVMValueRef ena, count = initial[3]; + if (same_thread_count) { + struct ac_arg arg; + arg.arg_index = 3; + arg.used = true; - count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); - ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, ""); - ac_build_ifcc(&ctx->ac, ena, 6506); + si_init_exec_from_input(ctx, arg, 0); + } else { + LLVMValueRef ena, count = initial[3]; + + count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), ""); + ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, ""); + ac_build_ifcc(&ctx->ac, ena, 6506); + } } /* Derive arguments for the next part from outputs of the @@ -675,7 +683,8 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part ret = ac_build_call(&ctx->ac, parts[part], in, num_params); - if (si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) { + if (!same_thread_count && + si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) { ac_build_endif(&ctx->ac, 6506); /* The second half of the merged shader should use @@ -729,7 +738,8 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part } /* Close the conditional wrapping the second shader. */ - if (ctx->stage == MESA_SHADER_TESS_CTRL && si_is_multi_part_shader(ctx->shader)) + if (ctx->stage == MESA_SHADER_TESS_CTRL && + !same_thread_count && si_is_multi_part_shader(ctx->shader)) ac_build_endif(&ctx->ac, 6507); assert(LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c index af662eb0ebb..dd5f64716b0 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c @@ -988,7 +988,7 @@ void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader si_llvm_build_ps_epilog(ctx, &epilog_key); parts[num_parts++] = ctx->main_fn; - si_build_wrapper_function(ctx, parts, num_parts, main_index, 0); + si_build_wrapper_function(ctx, parts, num_parts, main_index, 0, false); } void si_llvm_init_ps_callbacks(struct si_shader_context *ctx) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index beff65f3786..8a72921b87a 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1890,22 +1890,35 @@ static void si_draw_vbo(struct pipe_context *ctx, sctx->do_update_shaders = true; } - if (sctx->tes_shader.cso && sctx->screen->info.has_ls_vgpr_init_bug) { - /* Determine whether the LS VGPR fix should be applied. - * - * It is only required when num input CPs > num output CPs, - * which cannot happen with the fixed function TCS. We should - * also update this bit when switching from TCS to fixed - * function TCS. - */ + if (sctx->tes_shader.cso) { struct si_shader_selector *tcs = sctx->tcs_shader.cso; - bool ls_vgpr_fix = - tcs && info->vertices_per_patch > tcs->info.base.tess.tcs_vertices_out; - if (ls_vgpr_fix != sctx->ls_vgpr_fix) { - sctx->ls_vgpr_fix = ls_vgpr_fix; + /* The rarely occuring tcs == NULL case is not optimized. */ + bool same_patch_vertices = + sctx->chip_class >= GFX9 && + tcs && info->vertices_per_patch == tcs->info.base.tess.tcs_vertices_out; + + if (sctx->same_patch_vertices != same_patch_vertices) { + sctx->same_patch_vertices = same_patch_vertices; sctx->do_update_shaders = true; } + + if (sctx->screen->info.has_ls_vgpr_init_bug) { + /* Determine whether the LS VGPR fix should be applied. + * + * It is only required when num input CPs > num output CPs, + * which cannot happen with the fixed function TCS. We should + * also update this bit when switching from TCS to fixed + * function TCS. + */ + bool ls_vgpr_fix = + tcs && info->vertices_per_patch > tcs->info.base.tess.tcs_vertices_out; + + if (ls_vgpr_fix != sctx->ls_vgpr_fix) { + sctx->ls_vgpr_fix = ls_vgpr_fix; + sctx->do_update_shaders = true; + } + } } if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) { diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 8ea69056240..cd2168534a9 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1884,6 +1884,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh * The LS VGPR fix prefers this too. */ key->opt.prefer_mono = 1; + key->opt.same_patch_vertices = sctx->same_patch_vertices; } key->part.tcs.epilog.prim_mode =