ac/llvm: allocate LLVM PS output variables on demand

This stops relying on si_shader_info to pre-allocate the output variables, allowing further cleanup of si_shader_info.

radv_load_output was unused, so it is removed.

Reviewed-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35392>
2025-06-01 18:47:05 -04:00
parent 6b2331d5f7
commit 447d744833
5 changed files with 44 additions and 57 deletions
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -2047,10 +2047,18 @@ static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *
         continue;

      LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
-      LLVMValueRef output_addr = ctx->abi->outputs[base * 4 + chan];
+      LLVMTypeRef val_type = LLVMTypeOf(value);
+      assert(val_type == ctx->ac.f32 || val_type == ctx->ac.f16);
+      LLVMTypeRef output_type = ctx->stage == MESA_SHADER_FRAGMENT ? val_type : ctx->ac.f32;
+      LLVMValueRef *output_addr = &ctx->abi->outputs[base * 4 + chan];

-      if (!ctx->abi->is_16bit[base * 4 + chan] &&
-          LLVMTypeOf(value) == ctx->ac.f16) {
+      /* Allocate the output variable on demand. */
+      if (!*output_addr) {
+         *output_addr = ac_build_alloca_undef(&ctx->ac, output_type, "");
+         ctx->abi->is_16bit[base * 4 + chan] = output_type == ctx->ac.f16;
+      }
+
+      if (val_type == ctx->ac.f16 && output_type == ctx->ac.f32) {
         LLVMValueRef output, index;

         /* Insert the 16-bit value into the low or high bits of the 32-bit output
@@ -2058,11 +2066,11 @@ static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *
          */
         index = LLVMConstInt(ctx->ac.i32, nir_intrinsic_io_semantics(instr).high_16bits, 0);

-         output = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.v2f16, output_addr, "");
+         output = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.v2f16, *output_addr, "");
         output = LLVMBuildInsertElement(ctx->ac.builder, output, value, index, "");
         value = LLVMBuildBitCast(ctx->ac.builder, output, ctx->ac.f32, "");
      }
-      LLVMBuildStore(ctx->ac.builder, value, output_addr);
+      LLVMBuildStore(ctx->ac.builder, value, *output_addr);
   }
 }

--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -183,15 +183,6 @@ radv_get_sampler_desc(struct ac_shader_abi *abi, LLVMValueRef index, enum ac_des
   return radv_load_rsrc(ctx, index, v4 ? ctx->ac.v4i32 : ctx->ac.v8i32);
 }

-static LLVMValueRef
-radv_load_output(struct radv_shader_context *ctx, unsigned index, unsigned chan)
-{
-   int idx = ac_llvm_reg_index_soa(index, chan);
-   LLVMValueRef output = ctx->abi.outputs[idx];
-   LLVMTypeRef type = ctx->abi.is_16bit[idx] ? ctx->ac.f16 : ctx->ac.f32;
-   return LLVMBuildLoad2(ctx->ac.builder, type, output, "");
-}
-
 static void
 ac_llvm_finalize_module(struct radv_shader_context *ctx, struct ac_midend_optimizer *meo)
 {
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -684,28 +684,6 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade
                                info->options & SI_PROFILE_CLAMP_DIV_BY_ZERO;
   ctx->abi.disable_aniso_single_level = true;

-   bool ls_need_output =
-      ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.as_ls &&
-      shader->key.ge.opt.same_patch_vertices;
-
-   bool ps_need_output = ctx->stage == MESA_SHADER_FRAGMENT;
-
-   if (ls_need_output || ps_need_output) {
-      for (unsigned i = 0; i < info->num_outputs; i++) {
-         LLVMTypeRef type = ctx->ac.f32;
-
-         /* Only FS uses unpacked f16. Other stages pack 16-bit outputs into low and high bits of f32. */
-         if (nir->info.stage == MESA_SHADER_FRAGMENT &&
-             nir_alu_type_get_type_size(ctx->shader->selector->info.output_type[i]) == 16)
-            type = ctx->ac.f16;
-
-         for (unsigned j = 0; j < 4; j++) {
-            ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, type, "");
-            ctx->abi.is_16bit[i * 4 + j] = type == ctx->ac.f16;
-         }
-      }
-   }
-
   if (!ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args->ac, nir))
      return false;

--- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
@@ -353,35 +353,38 @@ void si_llvm_ps_build_end(struct si_shader_context *ctx)
   struct si_shader_info *info = &shader->selector->info;
   LLVMBuilderRef builder = ctx->ac.builder;
   unsigned i, j, vgpr;
-   LLVMValueRef *addrs = ctx->abi.outputs;

   LLVMValueRef color[8][4] = {};
+   uint8_t color_output_mask = 0, is_16bit_mask = 0;
   LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
   LLVMValueRef ret;

   /* Read the output values. */
   for (i = 0; i < info->num_outputs; i++) {
      unsigned semantic = info->output_semantic[i];
-      LLVMTypeRef type = ctx->abi.is_16bit[4 * i] ? ctx->ac.f16 : ctx->ac.f32;

      switch (semantic) {
      case FRAG_RESULT_DEPTH:
-         depth = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
+         depth = LLVMBuildLoad2(builder, ctx->ac.f32, ctx->abi.outputs[4 * i + 0], "");
         break;
      case FRAG_RESULT_STENCIL:
-         stencil = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
+         stencil = LLVMBuildLoad2(builder, ctx->ac.f32, ctx->abi.outputs[4 * i + 0], "");
         break;
      case FRAG_RESULT_SAMPLE_MASK:
-         samplemask = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
+         samplemask = LLVMBuildLoad2(builder, ctx->ac.f32, ctx->abi.outputs[4 * i + 0], "");
         break;
      default:
         if (semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) {
            unsigned index = semantic - FRAG_RESULT_DATA0;

            for (j = 0; j < 4; j++) {
-               LLVMValueRef ptr = addrs[4 * i + j];
-               type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32;
-               LLVMValueRef result = LLVMBuildLoad2(builder, type, ptr, "");
+               if (!ctx->abi.outputs[4 * i + j])
+                  continue;
+
+               color_output_mask |= BITFIELD_BIT(index);
+               is_16bit_mask |= ctx->abi.is_16bit[4 * i + j] ? BITFIELD_BIT(index) : 0;
+               LLVMTypeRef type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32;
+               LLVMValueRef result = LLVMBuildLoad2(builder, type, ctx->abi.outputs[4 * i + j], "");
               color[index][j] = result;
            }
         } else {
@@ -401,20 +404,28 @@ void si_llvm_ps_build_end(struct si_shader_context *ctx)

   /* Set VGPRs */
   vgpr = SI_SGPR_ALPHA_REF + 1;
-   for (i = 0; i < ARRAY_SIZE(color); i++) {
-      if (!color[i][0])
-         continue;

-      if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) {
+   u_foreach_bit(i, color_output_mask) {
+      if (is_16bit_mask & BITFIELD_BIT(i)) {
         for (j = 0; j < 2; j++) {
-            LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
-            tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
-            ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr++, "");
+            if (color[i][j * 2] || color[i][j * 2 + 1]) {
+               for (unsigned k = 0; k < 2; k++) {
+                  if (!color[i][j * 2 + k])
+                     color[i][j * 2 + k] = LLVMGetUndef(ctx->ac.f16);
+               }
+               LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
+               tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
+               ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr, "");
+            }
+            vgpr++;
         }
         vgpr += 2;
      } else {
-         for (j = 0; j < 4; j++)
-            ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+         for (j = 0; j < 4; j++) {
+            if (color[i][j])
+               ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr, "");
+            vgpr++;
+         }
      }
   }
   if (depth)
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
@@ -74,7 +74,6 @@ void si_llvm_ls_build_end(struct si_shader_context *ctx)
      assert(shader->is_monolithic);

      struct si_shader_info *info = &shader->selector->info;
-      LLVMValueRef *addrs = ctx->abi.outputs;

      for (unsigned i = 0; i < info->num_outputs; i++) {
         unsigned semantic = info->output_semantic[i];
@@ -84,11 +83,11 @@ void si_llvm_ls_build_end(struct si_shader_context *ctx)
            continue;

         for (unsigned chan = 0; chan < 4; chan++) {
-            if (!(info->output_usagemask[i] & (1 << chan)))
+            if (!ctx->abi.outputs[4 * i + chan])
               continue;

-            LLVMValueRef value = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32, addrs[4 * i + chan], "");
-
+            LLVMValueRef value = LLVMBuildLoad2(ctx->ac.builder, ctx->ac.f32,
+                                                ctx->abi.outputs[4 * i + chan], "");
            ret = LLVMBuildInsertValue(ctx->ac.builder, ret, value, vgpr + param * 4 + chan, "");
         }
      }