agx: start a crude cycle model

based on notes by Dougall Johnson and Philip Turner.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28483>
This commit is contained in:
Alyssa Rosenzweig
2024-03-05 15:40:58 -04:00
committed by Marge Bot
parent 85f7310ba7
commit b9a359e9bd
3 changed files with 108 additions and 6 deletions
+10 -6
View File
@@ -2319,15 +2319,19 @@ agx_dump_stats(agx_context *ctx, unsigned size, char **out)
fills++;
}
struct agx_cycle_estimate cycles = agx_estimate_cycles(ctx);
unsigned nr_threads =
agx_occupancy_for_register_count(ctx->max_reg).max_threads;
return asprintf(out,
"%s shader: %u inst, %u bytes, %u regs, %u uniforms, "
"%u scratch, %u threads, %u loops, %u:%u spills:fills",
gl_shader_stage_name(ctx->stage), nr_ins, size, ctx->max_reg,
ctx->out->push_count, ctx->scratch_size, nr_threads,
ctx->loop_count, spills, fills);
return asprintf(
out,
"%s shader: %u inst, %u alu, %u fscib, %u ic, %u bytes, %u regs, "
"%u uniforms, %u scratch, %u threads, %u loops, "
"%u:%u spills:fills",
gl_shader_stage_name(ctx->stage), nr_ins, cycles.alu, cycles.f_scib,
cycles.ic, size, ctx->max_reg, ctx->out->push_count, ctx->scratch_size,
nr_threads, ctx->loop_count, spills, fills);
}
static bool
+15
View File
@@ -1004,6 +1004,21 @@ bool agx_nir_lower_ubo(nir_shader *shader);
bool agx_nir_lower_shared_bitsize(nir_shader *shader);
bool agx_nir_lower_frag_sidefx(nir_shader *s);
/* Result of the crude cycle model, as returned by agx_estimate_cycles(). Each
 * field is a sum of per-instruction throughput costs over the whole shader.
 */
struct agx_cycle_estimate {
/* ALU throughput: the larger of f_scib and ic */
unsigned alu;
/* Floating point and SCIB (select, conditional, integer, and boolean)
 * throughput. Accumulated for every instruction with timing data that does not
 * execute on the IC unit.
 */
unsigned f_scib;
/* IC (Integer and complex) throughput. Each IC instruction contributes twice
 * its tabulated throughput cost.
 */
unsigned ic;
};
/* Walk every instruction in the shader held by ctx and return a crude estimate
 * of its ALU cost. Instructions without timing data contribute nothing.
 */
struct agx_cycle_estimate agx_estimate_cycles(agx_context *ctx);
extern int agx_compiler_debug;
#ifdef __cplusplus
+83
View File
@@ -5,6 +5,7 @@
#include "agx_compile.h"
#include "agx_compiler.h"
#include "agx_opcodes.h"
/* Table describing the relationship between register pressure and thread
* count. Each entry describes a maximum number of registers and the associated
@@ -46,3 +47,85 @@ agx_max_registers_for_occupancy(unsigned occupancy)
assert(max_regs > 0 && "Thread count must be less than the maximum");
return max_regs;
}
/* Crude cycle model for G13G */
/* Execution unit an ALU opcode is issued to.
 *
 * NONE must stay zero: entries of the timing table that are not explicitly
 * listed are zero-initialized, and the cycle estimator treats a zero unit as
 * "no timing data".
 */
enum alu_unit {
   NONE = 0,
   SCIB,
   IC,
   F32,
   F16,
};
/* Timing data for a single opcode, used as the element type of op_timings. */
struct alu_timing {
/* Unit the opcode executes on; NONE (zero) means no timing data is known */
enum alu_unit unit;
/* Presumably result latency in cycles (TODO confirm). Recorded in the table but
 * not yet consumed by agx_estimate_cycles().
 */
unsigned latency;
/* Throughput cost that agx_estimate_cycles() adds for each instruction */
unsigned tp;
};
/* clang-format off */
/* Per-opcode timing table, indexed by agx_opcode. Each entry is
 * { unit, latency, tp }. Opcodes that are not listed are zero-initialized, so
 * their unit is NONE and they are ignored by the cycle estimator.
 *
 * The table is read-only and private to this file, hence static const: this
 * keeps the generic name out of the global namespace and lets the data live in
 * read-only storage.
 */
static const struct alu_timing op_timings[] = {
   [AGX_OPCODE_FMA]              = { F32,  2,  1 },
   [AGX_OPCODE_FADD]             = { F32,  2,  1 },
   [AGX_OPCODE_FMUL]             = { F32,  2,  1 },

   [AGX_OPCODE_MOV_IMM]          = { SCIB, 1,  1 },
   [AGX_OPCODE_BITOP]            = { SCIB, 2,  1 }, /* tp might be 2 for 32-bit / no $? */
   [AGX_OPCODE_ICMPSEL]          = { SCIB, 2,  1 },
   [AGX_OPCODE_FCMPSEL]          = { SCIB, 2,  1 },
   [AGX_OPCODE_IADD]             = { SCIB, 2,  1 },

   [AGX_OPCODE_GET_SR]           = { SCIB, 2,  2 },
   [AGX_OPCODE_GET_SR_BARRIER]   = { SCIB, 2,  2 },
   [AGX_OPCODE_GET_SR_COVERAGE]  = { SCIB, 2,  2 },

   [AGX_OPCODE_IMAD]             = { IC,   3,  2 },
   [AGX_OPCODE_BFI]              = { IC,   3,  2 },
   [AGX_OPCODE_EXTR]             = { IC,   3,  2 },
   [AGX_OPCODE_ASR]              = { IC,   3,  2 },
   [AGX_OPCODE_FLOOR]            = { IC,   3,  2 },
   [AGX_OPCODE_SIN_PT_1]         = { IC,   3,  2 },
   [AGX_OPCODE_SIN_PT_2]         = { IC,   5,  2 },
   [AGX_OPCODE_LOG2]             = { IC,   5,  2 },
   [AGX_OPCODE_EXP2]             = { IC,   5,  2 },
   [AGX_OPCODE_RCP]              = { IC,   5,  3 },
   [AGX_OPCODE_RSQRT]            = { IC,   6,  4 },
   [AGX_OPCODE_SRSQRT]           = { IC,   6,  4 },

   [AGX_OPCODE_SIMD_PREFIX_IADD] = { SCIB, 18, 18 },
   [AGX_OPCODE_SIMD_IADD]        = { SCIB, 24, 24 },
   [AGX_OPCODE_SIMD_SHUFFLE]     = { SCIB, 5,  2 },

   [AGX_OPCODE_ICMP_BALLOT]      = { SCIB, 5,  2 },
   [AGX_OPCODE_FCMP_BALLOT]      = { SCIB, 5,  2 },
   [AGX_OPCODE_ICMP_QUAD_BALLOT] = { SCIB, 4,  2 },
   [AGX_OPCODE_FCMP_QUAD_BALLOT] = { SCIB, 4,  2 },
};
/* clang-format on */
/*
 * Crude ALU cost estimate for the shader in ctx.
 *
 * Every instruction is looked up in op_timings by opcode. Instructions on the
 * IC unit add twice their tabulated throughput cost to the IC total; all other
 * instructions with timing data add their throughput cost to the F/SCIB total.
 * Opcodes beyond the end of the table, or without an entry, add nothing.
 *
 * TODO: Model non-ALU instructions, latency, register cache, 64-bit, etc.
 */
struct agx_cycle_estimate
agx_estimate_cycles(agx_context *ctx)
{
   unsigned ic_cycles = 0;
   unsigned f_scib_cycles = 0;

   agx_foreach_instr_global(ctx, I) {
      /* Default to "no timing data" for opcodes past the end of the table */
      enum alu_unit unit = NONE;
      unsigned tp = 0;

      if (I->op < ARRAY_SIZE(op_timings)) {
         unit = op_timings[I->op].unit;
         tp = op_timings[I->op].tp;
      }

      switch (unit) {
      case NONE:
         /* TODO */
         break;

      case IC:
         ic_cycles += tp * 2;
         break;

      default:
         f_scib_cycles += tp;
         break;
      }
   }

   /* IC and F/SCIB run in parallel across warps */
   return (struct agx_cycle_estimate){
      .alu = MAX2(ic_cycles, f_scib_cycles),
      .f_scib = f_scib_cycles,
      .ic = ic_cycles,
   };
}