From 447d74483364f183b51b67b03e8e4ed2de9d5bf3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 1 Jun 2025 18:47:05 -0400
Subject: [PATCH] ac/llvm: allocate LLVM PS output variables on demand

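Allocate each output variable lazily in visit_store_output, the first
time a component is stored, instead of pre-allocating all of them in
si_llvm_translate_nir. This stops relying on si_shader_info, allowing
further cleanup of si_shader_info.

si_llvm_ps_build_end and si_llvm_ls_build_end now skip output components
whose variable was never allocated.

Also remove radv_load_output, which was unused.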

Reviewed-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35392>
---
 src/amd/llvm/ac_nir_to_llvm.c                 | 18 +++++---
 src/amd/vulkan/radv_nir_to_llvm.c             |  9 ----
 src/gallium/drivers/radeonsi/si_shader_llvm.c | 22 ---------
 .../drivers/radeonsi/si_shader_llvm_ps.c      | 45 ++++++++++++-------
 .../drivers/radeonsi/si_shader_llvm_tess.c    |  7 ++-
 5 files changed, 44 insertions(+), 57 deletions(-)

diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index 0c7bf60cf1e..b7ccaa8338d 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -2047,10 +2047,18 @@ static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *
          continue;
 
       LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
-      LLVMValueRef output_addr = ctx->abi->outputs[base * 4 + chan];
+      LLVMTypeRef val_type = LLVMTypeOf(value);
+      assert(val_type == ctx->ac.f32 || val_type == ctx->ac.f16);
+      LLVMTypeRef output_type = ctx->stage == MESA_SHADER_FRAGMENT ? val_type : ctx->ac.f32;
+      LLVMValueRef *output_addr = &ctx->abi->outputs[base * 4 + chan];
 
-      if (!ctx->abi->is_16bit[base * 4 + chan] &&
-          LLVMTypeOf(value) == ctx->ac.f16) {
+      /* Allocate the output variable on demand. */
+      if (!*output_addr) {
+         *output_addr = ac_build_alloca_undef(&ctx->ac, output_type, "");
+         ctx->abi->is_16bit[base * 4 + chan] = output_type == ctx->ac.f16;
+      }
+
+      if (val_type == ctx->ac.f16 && output_type == ctx->ac.f32) {
          LLVMValueRef output, index;
 
          /* Insert the 16-bit value into the low or high bits of the 32-bit output
@@ -2058,11 +2066,11 @@ static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *
           */
          index = LLVMConstInt(ctx->ac.i32, nir_intrinsic_io_semantics(instr).high_16bits, 0);
 
-         output = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.v2f16, output_addr, "");
+         output = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.v2f16, *output_addr, "");
          output = LLVMBuildInsertElement(ctx->ac.builder, output, value, index, "");
          value = LLVMBuildBitCast(ctx->ac.builder, output, ctx->ac.f32, "");
       }
-      LLVMBuildStore(ctx->ac.builder, value, output_addr);
+      LLVMBuildStore(ctx->ac.builder, value, *output_addr);
    }
 }
 
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 226c84e62da..059d8795e7b 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -183,15 +183,6 @@ radv_get_sampler_desc(struct ac_shader_abi *abi, LLVMValueRef index, enum ac_des
    return radv_load_rsrc(ctx, index, v4 ? ctx->ac.v4i32 : ctx->ac.v8i32);
 }
 
-static LLVMValueRef
-radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
-{
-   int idx = ac_llvm_reg_index_soa(index, chan);
-   LLVMValueRef output = ctx->abi.outputs[idx];
-   LLVMTypeRef type = ctx->abi.is_16bit[idx] ? ctx->ac.f16 : ctx->ac.f32;
-   return LLVMBuildLoad2(ctx->ac.builder, type, output, "");
-}
-
 static void
 ac_llvm_finalize_module(struct radv_shader_context *ctx, struct ac_midend_optimizer *meo)
 {
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index 224e8036662..165d13935e5 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -684,28 +684,6 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade
                                 info->options & SI_PROFILE_CLAMP_DIV_BY_ZERO;
    ctx->abi.disable_aniso_single_level = true;
 
-   bool ls_need_output =
-      ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.as_ls &&
-      shader->key.ge.opt.same_patch_vertices;
-
-   bool ps_need_output = ctx->stage == MESA_SHADER_FRAGMENT;
-
-   if (ls_need_output || ps_need_output) {
-      for (unsigned i = 0; i < info->num_outputs; i++) {
-         LLVMTypeRef type = ctx->ac.f32;
-
-         /* Only FS uses unpacked f16. Other stages pack 16-bit outputs into low and high bits of f32. */
-         if (nir->info.stage == MESA_SHADER_FRAGMENT &&
-             nir_alu_type_get_type_size(ctx->shader->selector->info.output_type[i]) == 16)
-            type = ctx->ac.f16;
-
-         for (unsigned j = 0; j < 4; j++) {
-            ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, type, "");
-            ctx->abi.is_16bit[i * 4 + j] = type == ctx->ac.f16;
-         }
-      }
-   }
-
    if (!ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args->ac, nir))
       return false;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
index 8788df3be28..7ca9aa7ff6c 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
@@ -353,35 +353,38 @@ void si_llvm_ps_build_end(struct si_shader_context *ctx)
    struct si_shader_info *info = &shader->selector->info;
    LLVMBuilderRef builder = ctx->ac.builder;
    unsigned i, j, vgpr;
-   LLVMValueRef *addrs = ctx->abi.outputs;
 
    LLVMValueRef color[8][4] = {};
+   uint8_t color_output_mask = 0, is_16bit_mask = 0;
    LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
    LLVMValueRef ret;
 
    /* Read the output values. */
    for (i = 0; i < info->num_outputs; i++) {
       unsigned semantic = info->output_semantic[i];
-      LLVMTypeRef type = ctx->abi.is_16bit[4 * i] ? ctx->ac.f16 : ctx->ac.f32;
 
       switch (semantic) {
       case FRAG_RESULT_DEPTH:
-         depth = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
+         depth = LLVMBuildLoad2(builder, ctx->ac.f32, ctx->abi.outputs[4 * i + 0], "");
          break;
       case FRAG_RESULT_STENCIL:
-         stencil = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
+         stencil = LLVMBuildLoad2(builder, ctx->ac.f32, ctx->abi.outputs[4 * i + 0], "");
          break;
       case FRAG_RESULT_SAMPLE_MASK:
-         samplemask = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
+         samplemask = LLVMBuildLoad2(builder, ctx->ac.f32, ctx->abi.outputs[4 * i + 0], "");
          break;
       default:
          if (semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) {
             unsigned index = semantic - FRAG_RESULT_DATA0;
 
             for (j = 0; j < 4; j++) {
-               LLVMValueRef ptr = addrs[4 * i + j];
-               type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32;
-               LLVMValueRef result = LLVMBuildLoad2(builder, type, ptr, "");
+               if (!ctx->abi.outputs[4 * i + j])
+                  continue;
+
+               color_output_mask |= BITFIELD_BIT(index);
+               is_16bit_mask |= ctx->abi.is_16bit[4 * i + j] ? BITFIELD_BIT(index) : 0;
+               LLVMTypeRef type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32;
+               LLVMValueRef result = LLVMBuildLoad2(builder, type, ctx->abi.outputs[4 * i + j], "");
                color[index][j] = result;
             }
          } else {
@@ -401,20 +404,28 @@ void si_llvm_ps_build_end(struct si_shader_context *ctx)
 
    /* Set VGPRs */
    vgpr = SI_SGPR_ALPHA_REF + 1;
-   for (i = 0; i < ARRAY_SIZE(color); i++) {
-      if (!color[i][0])
-         continue;
 
-      if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) {
+   u_foreach_bit(i, color_output_mask) {
+      if (is_16bit_mask & BITFIELD_BIT(i)) {
          for (j = 0; j < 2; j++) {
-            LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
-            tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
-            ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr++, "");
+            if (color[i][j * 2] || color[i][j * 2 + 1]) {
+               for (unsigned k = 0; k < 2; k++) {
+                  if (!color[i][j * 2 + k])
+                     color[i][j * 2 + k] = LLVMGetUndef(ctx->ac.f16);
+               }
+               LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
+               tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
+               ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr, "");
+            }
+            vgpr++;
          }
          vgpr += 2;
       } else {
-         for (j = 0; j < 4; j++)
-            ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+         for (j = 0; j < 4; j++) {
+            if (color[i][j])
+               ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr, "");
+            vgpr++;
+         }
       }
    }
    if (depth)
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
index f88fb2d9173..f9b02be3f63 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
@@ -74,7 +74,6 @@ void si_llvm_ls_build_end(struct si_shader_context *ctx)
       assert(shader->is_monolithic);
 
       struct si_shader_info *info = &shader->selector->info;
-      LLVMValueRef *addrs = ctx->abi.outputs;
 
       for (unsigned i = 0; i < info->num_outputs; i++) {
          unsigned semantic = info->output_semantic[i];
@@ -84,11 +83,11 @@ void si_llvm_ls_build_end(struct si_shader_context *ctx)
             continue;
 
          for (unsigned chan = 0; chan < 4; chan++) {
-            if (!(info->output_usagemask[i] & (1 << chan)))
+            if (!ctx->abi.outputs[4 * i + chan])
                continue;
 
-            LLVMValueRef value = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, addrs[4 * i + chan], "");
-
+            LLVMValueRef value = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32,
+                                                ctx->abi.outputs[4 * i + chan], "");
             ret = LLVMBuildInsertValue(ctx->ac.builder, ret, value, vgpr + param * 4 + chan, "");
          }
       }