From b9a359e9bde48bbdbc336f22ad2e2b0ffe5b227f Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Tue, 5 Mar 2024 15:40:58 -0400
Subject: [PATCH] agx: start a crude cycle model

based on notes by Dougall Johnson and Philip Turner.

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
 src/asahi/compiler/agx_compile.c     |  16 +++--
 src/asahi/compiler/agx_compiler.h    |  17 +++++
 src/asahi/compiler/agx_performance.c | 100 +++++++++++++++++++++++++++
 3 files changed, 127 insertions(+), 6 deletions(-)

diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index ea31fa3b4cd..8d6054733d5 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -2319,15 +2319,19 @@ agx_dump_stats(agx_context *ctx, unsigned size, char **out)
          fills++;
    }
 
+   struct agx_cycle_estimate cycles = agx_estimate_cycles(ctx);
+
    unsigned nr_threads =
       agx_occupancy_for_register_count(ctx->max_reg).max_threads;
 
-   return asprintf(out,
-                   "%s shader: %u inst, %u bytes, %u regs, %u uniforms, "
-                   "%u scratch, %u threads, %u loops, %u:%u spills:fills",
-                   gl_shader_stage_name(ctx->stage), nr_ins, size, ctx->max_reg,
-                   ctx->out->push_count, ctx->scratch_size, nr_threads,
-                   ctx->loop_count, spills, fills);
+   return asprintf(
+      out,
+      "%s shader: %u inst, %u alu, %u fscib, %u ic, %u bytes, %u regs, "
+      "%u uniforms, %u scratch, %u threads, %u loops, "
+      "%u:%u spills:fills",
+      gl_shader_stage_name(ctx->stage), nr_ins, cycles.alu, cycles.f_scib,
+      cycles.ic, size, ctx->max_reg, ctx->out->push_count, ctx->scratch_size,
+      nr_threads, ctx->loop_count, spills, fills);
 }
 
 static bool
diff --git a/src/asahi/compiler/agx_compiler.h b/src/asahi/compiler/agx_compiler.h
index fc0efc3aacd..667bef12a35 100644
--- a/src/asahi/compiler/agx_compiler.h
+++ b/src/asahi/compiler/agx_compiler.h
@@ -1004,6 +1004,23 @@ bool agx_nir_lower_ubo(nir_shader *shader);
 bool agx_nir_lower_shared_bitsize(nir_shader *shader);
 bool agx_nir_lower_frag_sidefx(nir_shader *s);
 
+/* Per-unit throughput totals produced by agx_estimate_cycles */
+struct agx_cycle_estimate {
+   /* ALU throughput: the larger of f_scib and ic */
+   unsigned alu;
+
+   /* Floating point and SCIB (select, conditional, integer, and boolean)
+    * throughput.
+    */
+   unsigned f_scib;
+
+   /* IC (Integer and complex) throughput */
+   unsigned ic;
+};
+
+/* Accumulate the modelled ALU throughput cost of the shader held by ctx */
+struct agx_cycle_estimate agx_estimate_cycles(agx_context *ctx);
+
 extern int agx_compiler_debug;
 
 #ifdef __cplusplus
diff --git a/src/asahi/compiler/agx_performance.c b/src/asahi/compiler/agx_performance.c
index f9f6e49b261..7d76fab6c6c 100644
--- a/src/asahi/compiler/agx_performance.c
+++ b/src/asahi/compiler/agx_performance.c
@@ -5,6 +5,7 @@
 
 #include "agx_compile.h"
 #include "agx_compiler.h"
+#include "agx_opcodes.h"
 
 /* Table describing the relationship between registers pressure and thread
  * count. Each entry describes a maximum number of registers and the associated
@@ -46,3 +47,102 @@ agx_max_registers_for_occupancy(unsigned occupancy)
    assert(max_regs > 0 && "Thread count must be less than the maximum");
    return max_regs;
 }
+
+/* Crude cycle model for G13G */
+
+/* Execution unit an opcode is charged to. NONE must stay zero: table slots
+ * without an initializer are zero-filled and have to read as unmodelled.
+ */
+enum alu_unit {
+   NONE,
+   SCIB,
+   IC,
+   F32,
+   F16,
+};
+
+/* Timing of one opcode. Only unit and tp feed the estimate so far; latency
+ * is recorded but not consumed yet.
+ */
+struct alu_timing {
+   enum alu_unit unit;
+   unsigned latency;
+   unsigned tp;
+};
+
+/* Sparse table indexed by opcode; unlisted opcodes have unit == NONE */
+/* clang-format off */
+static const struct alu_timing op_timings[] = {
+   [AGX_OPCODE_FMA]              = { F32,   2,  1 },
+   [AGX_OPCODE_FADD]             = { F32,   2,  1 },
+   [AGX_OPCODE_FMUL]             = { F32,   2,  1 },
+
+   [AGX_OPCODE_MOV_IMM]          = { SCIB,  1,  1 },
+   [AGX_OPCODE_BITOP]            = { SCIB,  2,  1 }, /* tp might be 2 for 32-bit / no $? */
+   [AGX_OPCODE_ICMPSEL]          = { SCIB,  2,  1 },
+   [AGX_OPCODE_FCMPSEL]          = { SCIB,  2,  1 },
+   [AGX_OPCODE_IADD]             = { SCIB,  2,  1 },
+
+   [AGX_OPCODE_GET_SR]           = { SCIB,  2,  2 },
+   [AGX_OPCODE_GET_SR_BARRIER]   = { SCIB,  2,  2 },
+   [AGX_OPCODE_GET_SR_COVERAGE]  = { SCIB,  2,  2 },
+
+   [AGX_OPCODE_IMAD]             = { IC,    3,  2 },
+   [AGX_OPCODE_BFI]              = { IC,    3,  2 },
+   [AGX_OPCODE_EXTR]             = { IC,    3,  2 },
+   [AGX_OPCODE_ASR]              = { IC,    3,  2 },
+   [AGX_OPCODE_FLOOR]            = { IC,    3,  2 },
+   [AGX_OPCODE_SIN_PT_1]         = { IC,    3,  2 },
+   [AGX_OPCODE_SIN_PT_2]         = { IC,    5,  2 },
+   [AGX_OPCODE_LOG2]             = { IC,    5,  2 },
+   [AGX_OPCODE_EXP2]             = { IC,    5,  2 },
+   [AGX_OPCODE_RCP]              = { IC,    5,  3 },
+   [AGX_OPCODE_RSQRT]            = { IC,    6,  4 },
+   [AGX_OPCODE_SRSQRT]           = { IC,    6,  4 },
+
+   [AGX_OPCODE_SIMD_PREFIX_IADD] = { SCIB, 18, 18 },
+   [AGX_OPCODE_SIMD_IADD]        = { SCIB, 24, 24 },
+   [AGX_OPCODE_SIMD_SHUFFLE]     = { SCIB,  5,  2 },
+
+   [AGX_OPCODE_ICMP_BALLOT]      = { SCIB,  5,  2 },
+   [AGX_OPCODE_FCMP_BALLOT]      = { SCIB,  5,  2 },
+   [AGX_OPCODE_ICMP_QUAD_BALLOT] = { SCIB,  4,  2 },
+   [AGX_OPCODE_FCMP_QUAD_BALLOT] = { SCIB,  4,  2 },
+};
+/* clang-format on */
+
+/*
+ * Sum the throughput cost of every instruction visited by
+ * agx_foreach_instr_global, bucketed by execution unit. Instructions without
+ * a timing entry contribute nothing.
+ *
+ * TODO: Model non-ALU instructions, latency, register cache, 64-bit, etc.
+ */
+struct agx_cycle_estimate
+agx_estimate_cycles(agx_context *ctx)
+{
+   struct agx_cycle_estimate est = {0};
+
+   agx_foreach_instr_global(ctx, I) {
+      /* Opcodes past the end of the table are unmodelled as well */
+      struct alu_timing alu = I->op < ARRAY_SIZE(op_timings)
+                                 ? op_timings[I->op]
+                                 : (struct alu_timing){0};
+
+      if (alu.unit == IC) {
+         /* IC is charged at twice its table value.
+          * NOTE(review): presumably IC issues at half rate; confirm.
+          */
+         est.ic += alu.tp * 2;
+      } else if (alu.unit) {
+         /* SCIB, F32 and F16 all land in the shared F/SCIB bucket */
+         est.f_scib += alu.tp;
+      } else {
+         /* TODO */
+      }
+   }
+
+   /* IC and F/SCIB run in parallel across warps */
+   est.alu = MAX2(est.ic, est.f_scib);
+   return est;
+}