From 447d74483364f183b51b67b03e8e4ed2de9d5bf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 1 Jun 2025 18:47:05 -0400 Subject: [PATCH] ac/llvm: allocate LLVM PS output variables on demand This stops relying on si_shader_info, allowing further cleanup of si_shader_info. radv_load_output was unused. Reviewed-by: Qiang Yu Part-of: --- src/amd/llvm/ac_nir_to_llvm.c | 18 +++++--- src/amd/vulkan/radv_nir_to_llvm.c | 9 ---- src/gallium/drivers/radeonsi/si_shader_llvm.c | 22 --------- .../drivers/radeonsi/si_shader_llvm_ps.c | 45 ++++++++++++------- .../drivers/radeonsi/si_shader_llvm_tess.c | 7 ++- 5 files changed, 44 insertions(+), 57 deletions(-) diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 0c7bf60cf1e..b7ccaa8338d 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -2047,10 +2047,18 @@ static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr * continue; LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component); - LLVMValueRef output_addr = ctx->abi->outputs[base * 4 + chan]; + LLVMTypeRef val_type = LLVMTypeOf(value); + assert(val_type == ctx->ac.f32 || val_type == ctx->ac.f16); + LLVMTypeRef output_type = ctx->stage == MESA_SHADER_FRAGMENT ? val_type : ctx->ac.f32; + LLVMValueRef *output_addr = &ctx->abi->outputs[base * 4 + chan]; - if (!ctx->abi->is_16bit[base * 4 + chan] && - LLVMTypeOf(value) == ctx->ac.f16) { + /* Allocate the output variable on demand. */ + if (!*output_addr) { + *output_addr = ac_build_alloca_undef(&ctx->ac, output_type, ""); + ctx->abi->is_16bit[base * 4 + chan] = output_type == ctx->ac.f16; + } + + if (val_type == ctx->ac.f16 && output_type == ctx->ac.f32) { LLVMValueRef output, index; /* Insert the 16-bit value into the low or high bits of the 32-bit output @@ -2058,11 +2066,11 @@ static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr * */ index = LLVMConstInt(ctx->ac.i32, nir_intrinsic_io_semantics(instr).high_16bits, 0); - output = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.v2f16, output_addr, ""); + output = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.v2f16, *output_addr, ""); output = LLVMBuildInsertElement(ctx->ac.builder, output, value, index, ""); value = LLVMBuildBitCast(ctx->ac.builder, output, ctx->ac.f32, ""); } - LLVMBuildStore(ctx->ac.builder, value, output_addr); + LLVMBuildStore(ctx->ac.builder, value, *output_addr); } } diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 226c84e62da..059d8795e7b 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -183,15 +183,6 @@ radv_get_sampler_desc(struct ac_shader_abi *abi, LLVMValueRef index, enum ac_des return radv_load_rsrc(ctx, index, v4 ? ctx->ac.v4i32 : ctx->ac.v8i32); } -static LLVMValueRef -radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan) -{ - int idx = ac_llvm_reg_index_soa(index, chan); - LLVMValueRef output = ctx->abi.outputs[idx]; - LLVMTypeRef type = ctx->abi.is_16bit[idx] ? ctx->ac.f16 : ctx->ac.f32; - return LLVMBuildLoad2(ctx->ac.builder, type, output, ""); -} - static void ac_llvm_finalize_module(struct radv_shader_context *ctx, struct ac_midend_optimizer *meo) { diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 224e8036662..165d13935e5 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -684,28 +684,6 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade info->options & SI_PROFILE_CLAMP_DIV_BY_ZERO; ctx->abi.disable_aniso_single_level = true; - bool ls_need_output = - ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.as_ls && - shader->key.ge.opt.same_patch_vertices; - - bool ps_need_output = ctx->stage == MESA_SHADER_FRAGMENT; - - if (ls_need_output || ps_need_output) { - for (unsigned i = 0; i < info->num_outputs; i++) { - LLVMTypeRef type = ctx->ac.f32; - - /* Only FS uses unpacked f16. Other stages pack 16-bit outputs into low and high bits of f32. */ - if (nir->info.stage == MESA_SHADER_FRAGMENT && - nir_alu_type_get_type_size(ctx->shader->selector->info.output_type[i]) == 16) - type = ctx->ac.f16; - - for (unsigned j = 0; j < 4; j++) { - ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, type, ""); - ctx->abi.is_16bit[i * 4 + j] = type == ctx->ac.f16; - } - } - } - if (!ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args->ac, nir)) return false; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c index 8788df3be28..7ca9aa7ff6c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c @@ -353,35 +353,38 @@ void si_llvm_ps_build_end(struct si_shader_context *ctx) struct si_shader_info *info = &shader->selector->info; LLVMBuilderRef builder = ctx->ac.builder; unsigned i, j, vgpr; - LLVMValueRef *addrs = ctx->abi.outputs; LLVMValueRef color[8][4] = {}; + uint8_t color_output_mask = 0, is_16bit_mask = 0; LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL; LLVMValueRef ret; /* Read the output values. */ for (i = 0; i < info->num_outputs; i++) { unsigned semantic = info->output_semantic[i]; - LLVMTypeRef type = ctx->abi.is_16bit[4 * i] ? ctx->ac.f16 : ctx->ac.f32; switch (semantic) { case FRAG_RESULT_DEPTH: - depth = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], ""); + depth = LLVMBuildLoad2(builder, ctx->ac.f32, ctx->abi.outputs[4 * i + 0], ""); break; case FRAG_RESULT_STENCIL: - stencil = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], ""); + stencil = LLVMBuildLoad2(builder, ctx->ac.f32, ctx->abi.outputs[4 * i + 0], ""); break; case FRAG_RESULT_SAMPLE_MASK: - samplemask = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], ""); + samplemask = LLVMBuildLoad2(builder, ctx->ac.f32, ctx->abi.outputs[4 * i + 0], ""); break; default: if (semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) { unsigned index = semantic - FRAG_RESULT_DATA0; for (j = 0; j < 4; j++) { - LLVMValueRef ptr = addrs[4 * i + j]; - type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32; - LLVMValueRef result = LLVMBuildLoad2(builder, type, ptr, ""); + if (!ctx->abi.outputs[4 * i + j]) + continue; + + color_output_mask |= BITFIELD_BIT(index); + is_16bit_mask |= ctx->abi.is_16bit[4 * i + j] ? BITFIELD_BIT(index) : 0; + LLVMTypeRef type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32; + LLVMValueRef result = LLVMBuildLoad2(builder, type, ctx->abi.outputs[4 * i + j], ""); color[index][j] = result; } } else { @@ -401,20 +404,28 @@ void si_llvm_ps_build_end(struct si_shader_context *ctx) /* Set VGPRs */ vgpr = SI_SGPR_ALPHA_REF + 1; - for (i = 0; i < ARRAY_SIZE(color); i++) { - if (!color[i][0]) - continue; - if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) { + u_foreach_bit(i, color_output_mask) { + if (is_16bit_mask & BITFIELD_BIT(i)) { for (j = 0; j < 2; j++) { - LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2); - tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, ""); - ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr++, ""); + if (color[i][j * 2] || color[i][j * 2 + 1]) { + for (unsigned k = 0; k < 2; k++) { + if (!color[i][j * 2 + k]) + color[i][j * 2 + k] = LLVMGetUndef(ctx->ac.f16); + } + LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2); + tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, ""); + ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr, ""); + } + vgpr++; } vgpr += 2; } else { - for (j = 0; j < 4; j++) - ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, ""); + for (j = 0; j < 4; j++) { + if (color[i][j]) + ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr, ""); + vgpr++; + } } } if (depth) diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index f88fb2d9173..f9b02be3f63 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -74,7 +74,6 @@ void si_llvm_ls_build_end(struct si_shader_context *ctx) assert(shader->is_monolithic); struct si_shader_info *info = &shader->selector->info; - LLVMValueRef *addrs = ctx->abi.outputs; for (unsigned i = 0; i < info->num_outputs; i++) { unsigned semantic = info->output_semantic[i]; @@ -84,11 +83,11 @@ void si_llvm_ls_build_end(struct si_shader_context *ctx) continue; for (unsigned chan = 0; chan < 4; chan++) { - if (!(info->output_usagemask[i] & (1 << chan))) + if (!ctx->abi.outputs[4 * i + chan]) continue; - LLVMValueRef value = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, addrs[4 * i + chan], ""); - + LLVMValueRef value = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, + ctx->abi.outputs[4 * i + chan], ""); ret = LLVMBuildInsertValue(ctx->ac.builder, ret, value, vgpr + param * 4 + chan, ""); } }