From 8845702f8cee01d74212a817c49fb4ccfcf3a90f Mon Sep 17 00:00:00 2001
From: Asahi Lina <lina@asahilina.net>
Date: Wed, 8 Nov 2023 22:27:24 +0900
Subject: [PATCH] asahi: Allocate scratch for shaders

Finally, now that scratch is available, tell the hardware how much we
need and allocate the scratch buffers as necessary.

We only do the max subgroup calculation for compute right now, and it's
probably conservative (we could probably calculate a reduced occupancy
depending on register pressure). Getting this too low won't break
anything; the GPU will just be less efficient if there is not enough
scratch allocated. Vertex and fragment just get the max 96 for now.

Signed-off-by: Asahi Lina <lina@asahilina.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
---
 src/gallium/drivers/asahi/agx_state.c | 49 ++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index e3e191b546c..13171d5138f 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -3079,7 +3079,8 @@ agx_update_descriptors(struct agx_batch *batch, struct agx_compiled_shader *cs,
 
 static uint32_t
 agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
-                   enum pipe_shader_type stage, unsigned variable_shared_mem)
+                   enum pipe_shader_type stage, unsigned variable_shared_mem,
+                   size_t max_subgroups)
 {
    struct agx_context *ctx = batch->ctx;
    struct agx_usc_builder b =
@@ -3132,9 +3133,34 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
       cfg.unk_2 = (stage == PIPE_SHADER_FRAGMENT) ? 2 : 3;
    }
 
+   uint32_t spill_bucket = 0;
+
+   if (cs->info.scratch_size > 0) {
+      spill_bucket = agx_scratch_get_bucket(cs->info.scratch_size);
+
+      switch (stage) {
+      case PIPE_SHADER_FRAGMENT:
+         agx_scratch_alloc(&ctx->scratch_fs, cs->info.scratch_size,
+                           max_subgroups);
+         batch->fs_scratch = true;
+         break;
+      case PIPE_SHADER_VERTEX:
+         agx_scratch_alloc(&ctx->scratch_vs, cs->info.scratch_size,
+                           max_subgroups);
+         batch->vs_scratch = true;
+         break;
+      default:
+         agx_scratch_alloc(&ctx->scratch_cs, cs->info.scratch_size,
+                           max_subgroups);
+         batch->cs_scratch = true;
+         break;
+      }
+   }
+
    agx_usc_pack(&b, REGISTERS, cfg) {
       cfg.register_count = cs->info.nr_gprs;
       cfg.unk_1 = (stage == PIPE_SHADER_FRAGMENT);
+      cfg.spill_size = spill_bucket;
    }
 
    if (stage == PIPE_SHADER_FRAGMENT) {
@@ -3538,7 +3564,7 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out, bool is_lines,
       }
 
       agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
-         cfg.pipeline = agx_build_pipeline(batch, vs, PIPE_SHADER_VERTEX, 0);
+         cfg.pipeline = agx_build_pipeline(batch, vs, PIPE_SHADER_VERTEX, 0, 0);
       }
 
       agx_push(out, VDM_STATE_VERTEX_OUTPUTS, cfg) {
@@ -3725,7 +3751,7 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out, bool is_lines,
 
       agx_ppp_push(&ppp, FRAGMENT_SHADER, cfg) {
          cfg.pipeline =
-            agx_build_pipeline(batch, ctx->fs, PIPE_SHADER_FRAGMENT, 0),
+            agx_build_pipeline(batch, ctx->fs, PIPE_SHADER_FRAGMENT, 0, 0),
          cfg.uniform_register_count = ctx->fs->info.push_count;
          cfg.preshader_register_count = ctx->fs->info.nr_preamble_gprs;
          cfg.texture_state_register_count =
@@ -5093,6 +5119,19 @@ agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info,
    agx_update_descriptors(batch, cs, PIPE_SHADER_COMPUTE);
    agx_upload_uniforms(batch);
 
+   // TODO: This calculation is broken, so it is compiled out and 0 is passed.
+   size_t subgroups_per_core = 0;
+#if 0
+   if (!info->indirect) {
+      size_t subgroups_per_workgroup =
+         DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 32);
+      subgroups_per_core =
+         local_workgroups *
+         DIV_ROUND_UP(info->grid[0] * info->grid[1] * info->grid[2],
+                     ctx->scratch_cs.num_cores);
+   }
+#endif
+
    /* TODO: Ensure space if we allow multiple kernels in a batch */
    uint8_t *out = batch->cdm.current;
 
@@ -5108,8 +5147,8 @@ agx_launch(struct agx_batch *batch, const struct pipe_grid_info *info,
          agx_nr_tex_descriptors(batch, merged_stage(ctx, stage));
       cfg.sampler_state_register_count =
          translate_sampler_state_count(ctx, cs, stage);
-      cfg.pipeline =
-         agx_build_pipeline(batch, cs, stage, info->variable_shared_mem);
+      cfg.pipeline = agx_build_pipeline(
+         batch, cs, stage, info->variable_shared_mem, subgroups_per_core);
    }
 
    /* Added in G14X */