From b9a359e9bde48bbdbc336f22ad2e2b0ffe5b227f Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Tue, 5 Mar 2024 15:40:58 -0400
Subject: [PATCH] agx: start a crude cycle model

based on notes by Dougall Johnson and Philip Turner.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28483>
---
 src/asahi/compiler/agx_compile.c     | 16 ++++--
 src/asahi/compiler/agx_compiler.h    | 15 +++++
 src/asahi/compiler/agx_performance.c | 83 ++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+), 6 deletions(-)

diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index ea31fa3b4cd..8d6054733d5 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -2319,15 +2319,19 @@ agx_dump_stats(agx_context *ctx, unsigned size, char **out)
          fills++;
    }
 
+   struct agx_cycle_estimate cycles = agx_estimate_cycles(ctx);
+
    unsigned nr_threads =
       agx_occupancy_for_register_count(ctx->max_reg).max_threads;
 
-   return asprintf(out,
-                   "%s shader: %u inst, %u bytes, %u regs, %u uniforms, "
-                   "%u scratch, %u threads, %u loops, %u:%u spills:fills",
-                   gl_shader_stage_name(ctx->stage), nr_ins, size, ctx->max_reg,
-                   ctx->out->push_count, ctx->scratch_size, nr_threads,
-                   ctx->loop_count, spills, fills);
+   return asprintf(
+      out,
+      "%s shader: %u inst, %u alu, %u fscib, %u ic, %u bytes, %u regs, "
+      "%u uniforms, %u scratch, %u threads, %u loops, "
+      "%u:%u spills:fills",
+      gl_shader_stage_name(ctx->stage), nr_ins, cycles.alu, cycles.f_scib,
+      cycles.ic, size, ctx->max_reg, ctx->out->push_count, ctx->scratch_size,
+      nr_threads, ctx->loop_count, spills, fills);
 }
 
 static bool
diff --git a/src/asahi/compiler/agx_compiler.h b/src/asahi/compiler/agx_compiler.h
index fc0efc3aacd..667bef12a35 100644
--- a/src/asahi/compiler/agx_compiler.h
+++ b/src/asahi/compiler/agx_compiler.h
@@ -1004,6 +1004,21 @@ bool agx_nir_lower_ubo(nir_shader *shader);
 bool agx_nir_lower_shared_bitsize(nir_shader *shader);
 bool agx_nir_lower_frag_sidefx(nir_shader *s);
 
+struct agx_cycle_estimate { /* Filled in by agx_estimate_cycles() */
+   /* ALU throughput: the larger of f_scib and ic below */
+   unsigned alu;
+
+   /* Floating point and SCIB (select, conditional, integer, and boolean)
+    * throughput: summed cost of every modelled op that is not on the IC unit.
+    */
+   unsigned f_scib;
+
+   /* IC (integer and complex) throughput, at twice each op's table cost */
+   unsigned ic;
+};
+
+struct agx_cycle_estimate agx_estimate_cycles(agx_context *ctx);
+
 extern int agx_compiler_debug;
 
 #ifdef __cplusplus
diff --git a/src/asahi/compiler/agx_performance.c b/src/asahi/compiler/agx_performance.c
index f9f6e49b261..7d76fab6c6c 100644
--- a/src/asahi/compiler/agx_performance.c
+++ b/src/asahi/compiler/agx_performance.c
@@ -5,6 +5,7 @@
 
 #include "agx_compile.h"
 #include "agx_compiler.h"
+#include "agx_opcodes.h"
 
 /* Table describing the relationship between registers pressure and thread
  * count. Each entry describes a maximum number of registers and the associated
@@ -46,3 +47,85 @@ agx_max_registers_for_occupancy(unsigned occupancy)
    assert(max_regs > 0 && "Thread count must be less than the maximum");
    return max_regs;
 }
+
+/* Crude cycle model for G13G: the unit each modelled opcode is charged to */
+enum alu_unit {
+   NONE, /* Zero value, so opcodes with no timing entry land here */
+   SCIB,
+   IC,   /* Accumulated separately from all the other units */
+   F32,
+   F16,  /* NOTE(review): not referenced by any timing entry in this patch */
+};
+
+struct alu_timing { /* One opcode's entry in the timing table */
+   enum alu_unit unit; /* Selects which running total the opcode adds to */
+   unsigned latency;   /* Recorded but not read by agx_estimate_cycles() */
+   unsigned tp;        /* Throughput cost; presumably cycles (TODO confirm) */
+};
+
+/* clang-format off */
+static const struct alu_timing op_timings[] = { /* [op] = { unit, latency, tp } */
+   [AGX_OPCODE_FMA]           = { F32, 2, 1 },
+   [AGX_OPCODE_FADD]          = { F32, 2, 1 },
+   [AGX_OPCODE_FMUL]          = { F32, 2, 1 },
+
+   [AGX_OPCODE_MOV_IMM]       = { SCIB, 1, 1 },
+   [AGX_OPCODE_BITOP]         = { SCIB, 2, 1 }, /* tp might be 2 for 32-bit / no $? */
+   [AGX_OPCODE_ICMPSEL]       = { SCIB, 2, 1 },
+   [AGX_OPCODE_FCMPSEL]       = { SCIB, 2, 1 },
+   [AGX_OPCODE_IADD]          = { SCIB, 2, 1 },
+
+   [AGX_OPCODE_GET_SR]          = { SCIB, 2, 2 },
+   [AGX_OPCODE_GET_SR_BARRIER]  = { SCIB, 2, 2 },
+   [AGX_OPCODE_GET_SR_COVERAGE] = { SCIB, 2, 2 },
+
+   [AGX_OPCODE_IMAD]          = { IC, 3, 2 },
+   [AGX_OPCODE_BFI]           = { IC, 3, 2 },
+   [AGX_OPCODE_EXTR]          = { IC, 3, 2 },
+   [AGX_OPCODE_ASR]           = { IC, 3, 2 },
+   [AGX_OPCODE_FLOOR]         = { IC, 3, 2 },
+   [AGX_OPCODE_SIN_PT_1]      = { IC, 3, 2 },
+   [AGX_OPCODE_SIN_PT_2]      = { IC, 5, 2 },
+   [AGX_OPCODE_LOG2]          = { IC, 5, 2 },
+   [AGX_OPCODE_EXP2]          = { IC, 5, 2 },
+   [AGX_OPCODE_RCP]           = { IC, 5, 3 },
+   [AGX_OPCODE_RSQRT]         = { IC, 6, 4 },
+   [AGX_OPCODE_SRSQRT]        = { IC, 6, 4 },
+
+   [AGX_OPCODE_SIMD_PREFIX_IADD] = { SCIB, 18, 18 },
+   [AGX_OPCODE_SIMD_IADD]        = { SCIB, 24, 24 },
+   [AGX_OPCODE_SIMD_SHUFFLE]     = { SCIB, 5, 2   },
+
+   [AGX_OPCODE_ICMP_BALLOT]      = { SCIB, 5, 2   },
+   [AGX_OPCODE_FCMP_BALLOT]      = { SCIB, 5, 2   },
+   [AGX_OPCODE_ICMP_QUAD_BALLOT] = { SCIB, 4, 2   },
+   [AGX_OPCODE_FCMP_QUAD_BALLOT] = { SCIB, 4, 2   },
+};
+/* clang-format on */
+
+/* Adds up per-unit throughput costs over every instruction in the shader.
+ * TODO: Model non-ALU instructions, latency, register cache, 64-bit, etc.
+ */
+struct agx_cycle_estimate
+agx_estimate_cycles(agx_context *ctx)
+{
+   unsigned ic_total = 0, f_scib_total = 0;
+
+   agx_foreach_instr_global(ctx, I) {
+      /* Opcodes beyond the table behave like entries that were left zeroed */
+      struct alu_timing timing = {0};
+      if (I->op < ARRAY_SIZE(op_timings))
+         timing = op_timings[I->op];
+
+      /* TODO: the NONE unit is unmodelled and contributes no cycles */
+      if (timing.unit == IC)
+         ic_total += timing.tp * 2;
+      else if (timing.unit != NONE)
+         f_scib_total += timing.tp;
+   }
+
+   /* IC and F/SCIB run in parallel across warps */
+   return (struct agx_cycle_estimate){.alu = MAX2(ic_total, f_scib_total),
+                                      .f_scib = f_scib_total,
+                                      .ic = ic_total};
+}