From 07d9fc574cc2250351f014ee679849a86f592817 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 16 Dec 2025 14:56:24 +0100 Subject: [PATCH] ac/spm: implement the new derived SPM chunk for performance counters This is the new method to add performance counters to RGP captures. This will be used to add the new RGP 2.6 counters too. The previous SPM code will be deprecated at some point but it's hard to support all generations in one batch. So, I will implement this step by step. Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/common/ac_rgp.c | 222 +++++++++++++++++- src/amd/common/ac_spm.c | 501 ++++++++++++++++++++++++++++++++++++++++ src/amd/common/ac_spm.h | 110 +++++++++ 3 files changed, 830 insertions(+), 3 deletions(-) diff --git a/src/amd/common/ac_rgp.c b/src/amd/common/ac_rgp.c index 1d03dfeb88e..b3db17851f1 100644 --- a/src/amd/common/ac_rgp.c +++ b/src/amd/common/ac_rgp.c @@ -58,6 +58,10 @@ enum sqtt_file_chunk_type SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS, SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION, SQTT_FILE_CHUNK_TYPE_INSTRUMENTATION_TABLE, + + SQTT_FILE_CHUNK_TYPE_FIRST_TOOLS_TYPE = 128, + SQTT_FILE_CHUNK_TYPE_DERIVED_SPM_DB = SQTT_FILE_CHUNK_TYPE_FIRST_TOOLS_TYPE, + SQTT_FILE_CHUNK_TYPE_COUNT }; @@ -992,10 +996,203 @@ static void ac_sqtt_dump_spm(const struct ac_spm_trace *spm_trace, fseek(output, file_offset, SEEK_SET); } +/** + * SQTT Derived SPM DB info. 
+ */ +struct sqtt_derived_spm_group_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t group_name_length; + uint32_t group_description_length; + uint32_t num_counters; +}; + +struct sqtt_derived_spm_counter_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t counter_name_length; + uint32_t counter_description_length; + uint32_t num_components; + uint8_t usage_type; +}; + +struct sqtt_derived_spm_component_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t component_name_length; + uint32_t component_description_length; + uint32_t usage_type; +}; + +struct sqtt_file_chunk_derived_spm_db { + struct sqtt_file_chunk_header header; + uint32_t offset; + uint32_t flags; + uint32_t num_timestamps; + uint32_t num_groups; + uint32_t num_counters; + uint32_t num_components; + uint32_t sampling_interval; +}; + +static_assert(sizeof(struct sqtt_file_chunk_derived_spm_db) == 44, + "sqtt_file_chunk_derived_spm_db doesn't match RGP spec"); + +static void ac_sqtt_fill_derived_spm_db(const struct ac_spm_derived_trace *spm_derived_trace, + struct sqtt_file_chunk_derived_spm_db *chunk, + size_t file_offset, + uint32_t chunk_size) +{ + chunk->header.chunk_id.type = SQTT_FILE_CHUNK_TYPE_DERIVED_SPM_DB; + chunk->header.chunk_id.index = 0; + chunk->header.major_version = 0; + chunk->header.minor_version = 0; + chunk->header.size_in_bytes = chunk_size; + + chunk->offset = sizeof(*chunk); + chunk->flags = 0; + chunk->num_timestamps = spm_derived_trace->num_timestamps; + chunk->num_groups = spm_derived_trace->num_groups; + chunk->num_counters = spm_derived_trace->num_counters; + chunk->num_components = spm_derived_trace->num_components; + chunk->sampling_interval = spm_derived_trace->sample_interval; +} + +static void ac_sqtt_dump_derived_spm(const struct ac_spm_derived_trace *spm_derived_trace, + size_t file_offset, + FILE *output) +{ + struct sqtt_file_chunk_derived_spm_db derived_spm_db; + size_t file_derived_spm_db_offset = file_offset; + + fseek(output, 
sizeof(struct sqtt_file_chunk_derived_spm_db), SEEK_CUR); + file_offset += sizeof(struct sqtt_file_chunk_derived_spm_db); + + /* Dump timestamps. */ + for (uint32_t i = 0; i < spm_derived_trace->num_timestamps; i++) { + uint64_t timestamp = spm_derived_trace->timestamps[i]; + + file_offset += sizeof(timestamp); + fwrite(&timestamp, sizeof(timestamp), 1, output); + } + + /* Dump SPM groups. */ + for (uint32_t i = 0; i < spm_derived_trace->num_groups; i++) { + const struct ac_spm_derived_group *group = &spm_derived_trace->groups[i]; + const struct ac_spm_derived_group_descr *group_descr = group->descr; + struct sqtt_derived_spm_group_info group_info = {0}; + + const uint32_t num_counters = group_descr->num_counters; + const uint32_t name_length = strlen(group_descr->name); + + group_info.size_in_bytes = sizeof(group_info) + name_length + + num_counters * sizeof(uint32_t); + group_info.offset = sizeof(group_info); + group_info.group_name_length = name_length; + group_info.num_counters = num_counters; + + file_offset += sizeof(group_info) + group_info.group_name_length; + fwrite(&group_info, sizeof(group_info), 1, output); + fwrite(group_descr->name, group_info.group_name_length, 1, output); + + for (uint32_t j = 0; j < group_descr->num_counters; j++) { + const struct ac_spm_derived_counter_descr *counter_descr = group_descr->counters[j]; + uint32_t counter_id = counter_descr->id; + + file_offset += sizeof(uint32_t); + fwrite(&counter_id, sizeof(uint32_t), 1, output); + } + } + + /* Dump SPM counters. 
*/ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + const struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + const struct ac_spm_derived_counter_descr *counter_descr = counter->descr; + struct sqtt_derived_spm_counter_info counter_info = {0}; + + const uint32_t num_components = counter_descr->num_components; + const uint32_t name_length = strlen(counter_descr->name); + const uint32_t description_length = strlen(counter_descr->desc); + + counter_info.size_in_bytes = sizeof(counter_info) + name_length + + description_length + num_components * sizeof(uint32_t); + counter_info.offset = sizeof(counter_info); + counter_info.counter_name_length = name_length; + counter_info.counter_description_length = description_length; + counter_info.num_components = num_components; + counter_info.usage_type = counter_descr->usage; + + file_offset += sizeof(counter_info) + counter_info.counter_name_length + + counter_info.counter_description_length; + fwrite(&counter_info, sizeof(counter_info), 1, output); + fwrite(counter_descr->name, counter_info.counter_name_length, 1, output); + fwrite(counter_descr->desc, counter_info.counter_description_length, 1, output); + + for (uint32_t j = 0; j < counter_descr->num_components; j++) { + const struct ac_spm_derived_component_descr *component_descr = counter_descr->components[j]; + uint32_t component_id = component_descr->id; + + file_offset += sizeof(uint32_t); + fwrite(&component_id, sizeof(uint32_t), 1, output); + } + } + + /* Dump SPM components. 
*/ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + const struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + const struct ac_spm_derived_component_descr *component_descr = component->descr; + struct sqtt_derived_spm_component_info component_info = {0}; + + const uint32_t name_length = strlen(component_descr->name); + + component_info.size_in_bytes = sizeof(component_info) + name_length; + component_info.offset = sizeof(component_info); + component_info.component_name_length = name_length; + component_info.usage_type = component_descr->usage; + + file_offset += sizeof(component_info) + component_info.component_name_length + + component_info.component_description_length; + fwrite(&component_info, sizeof(component_info), 1, output); + fwrite(component_descr->name, component_info.component_name_length, 1, output); + } + + /* Dump counter values. */ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + const struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + + assert(util_dynarray_num_elements(&counter->values, double) == spm_derived_trace->num_timestamps); + util_dynarray_foreach(&counter->values, double, value) { + file_offset += sizeof(double); + fwrite(value, sizeof(double), 1, output); + } + } + + /* Dump component values. */ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + const struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + + assert(util_dynarray_num_elements(&component->values, double) == spm_derived_trace->num_timestamps); + util_dynarray_foreach(&component->values, double, value) { + file_offset += sizeof(double); + fwrite(value, sizeof(double), 1, output); + } + } + + /* SQTT Derived SPM chunk. 
*/ + ac_sqtt_fill_derived_spm_db(spm_derived_trace, &derived_spm_db, + file_derived_spm_db_offset, + file_offset - file_derived_spm_db_offset); + fseek(output, file_derived_spm_db_offset, SEEK_SET); + fwrite(&derived_spm_db, sizeof(struct sqtt_file_chunk_derived_spm_db), 1, output); + fseek(output, file_offset, SEEK_SET); +} + #if defined(USE_LIBELF) static void ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt_trace, - const struct ac_spm_trace *spm_trace, FILE *output) + const struct ac_spm_trace *spm_trace, + const struct ac_spm_derived_trace *spm_derived_trace, + FILE *output) { struct sqtt_file_chunk_asic_info asic_info = {0}; struct sqtt_file_chunk_cpu_info cpu_info = {0}; @@ -1193,12 +1390,25 @@ ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt } } - if (spm_trace) { + if (spm_derived_trace) { + ac_sqtt_dump_derived_spm(spm_derived_trace, file_offset, output); + } else if (spm_trace) { ac_sqtt_dump_spm(spm_trace, file_offset, output); } } #endif +static bool +ac_use_derived_spm_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace) +{ + if (!spm_trace) + return false; + + /* TODO: Enable for GPUs. */ + return false; +} + int ac_dump_rgp_capture(const struct radeon_info *info, struct ac_sqtt_trace *sqtt_trace, const struct ac_spm_trace *spm_trace) @@ -1223,7 +1433,13 @@ ac_dump_rgp_capture(const struct radeon_info *info, struct ac_sqtt_trace *sqtt_t if (!f) return -1; - ac_sqtt_dump_data(info, sqtt_trace, spm_trace, f); + struct ac_spm_derived_trace *spm_derived_trace = + ac_use_derived_spm_trace(info, spm_trace) ? 
ac_spm_get_derived_trace(info, spm_trace) : NULL; + + ac_sqtt_dump_data(info, sqtt_trace, spm_trace, spm_derived_trace, f); + + if (spm_derived_trace) + ac_spm_destroy_derived_trace(spm_derived_trace); fprintf(stderr, "RGP capture saved to '%s'\n", filename); diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c index e05097ef778..7115fddcff4 100644 --- a/src/amd/common/ac_spm.c +++ b/src/amd/common/ac_spm.c @@ -727,6 +727,507 @@ bool ac_spm_get_trace(const struct ac_spm *spm, struct ac_spm_trace *trace) return ac_spm_get_num_samples(spm, &trace->num_samples); } +/* SPM components. */ +/* Instruction cache components. */ +static struct ac_spm_derived_component_descr gfx10_inst_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_inst_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_inst_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* Scalar cache components. 
*/ +static struct ac_spm_derived_component_descr gfx10_scalar_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_scalar_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_scalar_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L0 cache components. */ +static struct ac_spm_derived_component_descr gfx10_l0_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l0_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l0_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L1 cache components. 
*/ +static struct ac_spm_derived_component_descr gfx10_l1_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l1_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l1_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L2 cache components. */ +static struct ac_spm_derived_component_descr gfx10_l2_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l2_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l2_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* SPM counters. */ +static struct ac_spm_derived_counter_descr gfx10_inst_cache_hit_counter = { + .id = AC_SPM_COUNTER_INST_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "Instruction cache hit", + .desc = "The percentage of read requests made that hit the data in the " + "Instruction cache. The Instruction cache supplies shader code to an " + "executing shader. Each request is 64 bytes in size. 
Value range: 0% " + "(no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_inst_cache_request_count_comp, + &gfx10_inst_cache_hit_count_comp, + &gfx10_inst_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_scalar_cache_hit_counter = { + .id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "Scalar cache hit", + .desc = "The percentage of read requests made from executing shader code " + "that hit the data in the Scalar cache. The Scalar cache contains data " + "that does not vary in each thread across the wavefront. Each request is " + "64 bytes in size. Value range: 0% (no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_scalar_cache_request_count_comp, + &gfx10_scalar_cache_hit_count_comp, + &gfx10_scalar_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l0_cache_hit_counter = { + .id = AC_SPM_COUNTER_L0_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L0 cache hit", + .desc = "The percentage of read requests that hit the data in the L0 cache. " + "The L0 cache contains vector data, which is data that may vary in each " + "thread across the wavefront. Each request is 128 bytes in size. Value " + "range: 0% (no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l0_cache_request_count_comp, + &gfx10_l0_cache_hit_count_comp, + &gfx10_l0_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l1_cache_hit_counter = { + .id = AC_SPM_COUNTER_L1_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L1 cache hit", + .desc = "The percentage of read or write requests that hit the data in the " + "L1 cache. The L1 cache is shared across all WGPs in a single shader " + "engine. Each request is 128 bytes in size. 
Value range: 0% (no hit) to " + "100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l1_cache_request_count_comp, + &gfx10_l1_cache_hit_count_comp, + &gfx10_l1_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l2_cache_hit_counter = { + .id = AC_SPM_COUNTER_L2_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L2 cache hit", + .desc = "The percentage of read or write requests that hit the data in the " + "L2 cache. The L2 cache is shared by many blocks across the GPU, " + "including the Command Processor, Geometry Engine, all WGPs, all Render " + "Backends, and others. Each request is 128 bytes in size. Value range: 0% " + "(no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l2_cache_request_count_comp, + &gfx10_l2_cache_hit_count_comp, + &gfx10_l2_cache_miss_count_comp, + }, +}; + +/* SPM groups. */ +static struct ac_spm_derived_group_descr gfx10_cache_group = { + .id = AC_SPM_GROUP_CACHE, + .name = "Cache", + .num_counters = 5, + .counters = { + &gfx10_inst_cache_hit_counter, + &gfx10_scalar_cache_hit_counter, + &gfx10_l0_cache_hit_counter, + &gfx10_l1_cache_hit_counter, + &gfx10_l2_cache_hit_counter, + }, +}; + +static struct ac_spm_derived_counter * +ac_spm_get_counter_by_id(struct ac_spm_derived_trace *spm_derived_trace, + enum ac_spm_counter_id counter_id) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + + if (counter->descr->id == counter_id) + return counter; + } + + return NULL; +} + +static struct ac_spm_derived_component * +ac_spm_get_component_by_id(struct ac_spm_derived_trace *spm_derived_trace, + enum ac_spm_component_id component_id) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + + if 
(component->descr->id == component_id) + return component; + } + + return NULL; +} + +static void +ac_spm_add_group(struct ac_spm_derived_trace *spm_derived_trace, + const struct ac_spm_derived_group_descr *group_descr) +{ + for (uint32_t i = 0; i < group_descr->num_counters; i++) { + const struct ac_spm_derived_counter_descr *counter_descr = + group_descr->counters[i]; + + for (uint32_t j = 0; j < counter_descr->num_components; j++) { + struct ac_spm_derived_component *component = + &spm_derived_trace->components[spm_derived_trace->num_components++]; + assert(spm_derived_trace->num_components <= AC_SPM_COMPONENT_COUNT); + + component->descr = counter_descr->components[j]; + } + + struct ac_spm_derived_counter *counter = + &spm_derived_trace->counters[spm_derived_trace->num_counters++]; + assert(spm_derived_trace->num_counters <= AC_SPM_COUNTER_COUNT); + counter->descr = counter_descr; + } + + struct ac_spm_derived_group *group = + &spm_derived_trace->groups[spm_derived_trace->num_groups++]; + assert(spm_derived_trace->num_groups <= AC_SPM_GROUP_COUNT); + group->descr = group_descr; +} + +static enum ac_spm_raw_counter_op +ac_spm_get_raw_counter_op(enum ac_spm_raw_counter_id id) +{ + switch (id) { + case AC_SPM_TCP_PERF_SEL_REQ: + case AC_SPM_TCP_PERF_SEL_REQ_MISS: + case AC_SPM_SQC_PERF_SEL_DCACHE_HITS: + case AC_SPM_SQC_PERF_SEL_DCACHE_MISSES: + case AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE: + case AC_SPM_SQC_PERF_SEL_ICACHE_HITS: + case AC_SPM_SQC_PERF_SEL_ICACHE_MISSES: + case AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE: + case AC_SPM_GL1C_PERF_SEL_REQ: + case AC_SPM_GL1C_PERF_SEL_REQ_MISS: + case AC_SPM_GL2C_PERF_SEL_REQ: + case AC_SPM_GL2C_PERF_SEL_MISS: + return AC_SPM_RAW_COUNTER_OP_SUM; + default: + UNREACHABLE("Invalid SPM raw counter ID."); + } +} + +struct ac_spm_derived_trace * +ac_spm_get_derived_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace) +{ + uint32_t sample_size_in_bytes = spm_trace->sample_size_in_bytes; + 
uint8_t *spm_data_ptr = (uint8_t *)spm_trace->ptr; + struct ac_spm_derived_trace *spm_derived_trace; + + spm_derived_trace = calloc(1, sizeof(*spm_derived_trace)); + if (!spm_derived_trace) + return NULL; + + /* Add groups to the trace. */ + ac_spm_add_group(spm_derived_trace, &gfx10_cache_group); + + spm_derived_trace->timestamps = malloc(spm_trace->num_samples * sizeof(uint64_t)); + if (!spm_derived_trace->timestamps) { + free(spm_derived_trace); + return NULL; + } + + /* Skip the reserved 32 bytes of data at beginning. */ + spm_data_ptr += 32; + + /* Collect timestamps. */ + uint64_t sample_size_in_qwords = sample_size_in_bytes / sizeof(uint64_t); + uint64_t *timestamp_ptr = (uint64_t *)spm_data_ptr; + + for (uint32_t i = 0; i < spm_trace->num_samples; i++) { + uint64_t index = i * sample_size_in_qwords; + uint64_t timestamp = timestamp_ptr[index]; + + spm_derived_trace->timestamps[i] = timestamp; + } + + /* Collect raw counter values. */ + uint64_t *raw_counter_values[AC_SPM_RAW_COUNTER_ID_COUNT]; + for (uint32_t i = 0; i < AC_SPM_RAW_COUNTER_ID_COUNT; i++) { + raw_counter_values[i] = calloc(spm_trace->num_samples, sizeof(uint64_t)); + } + + const uint32_t sample_size_in_hwords = sample_size_in_bytes / sizeof(uint16_t); + const uint16_t *counter_values_ptr = (uint16_t *)spm_data_ptr; + + for (uint32_t c = 0; c < spm_trace->num_counters; c++) { + const uint64_t offset = spm_trace->counters[c].offset; + const uint32_t id = spm_trace->counters[c].id; + const enum ac_spm_raw_counter_op op = ac_spm_get_raw_counter_op(id); + + for (uint32_t s = 0; s < spm_trace->num_samples; s++) { + const uint64_t index = offset + (s * sample_size_in_hwords); + const uint16_t value = counter_values_ptr[index]; + + switch (op) { + case AC_SPM_RAW_COUNTER_OP_SUM: + raw_counter_values[id][s] += value; + break; + default: + UNREACHABLE("Invalid SPM raw counter OP.\n"); + } + } + } + +#define GET_COMPONENT(n) \ + struct ac_spm_derived_component *_##n = \ + 
ac_spm_get_component_by_id(spm_derived_trace, AC_SPM_COMPONENT_##n); +#define GET_COUNTER(n) \ + struct ac_spm_derived_counter *_##n = \ + ac_spm_get_counter_by_id(spm_derived_trace, AC_SPM_COUNTER_##n); + + GET_COUNTER(INST_CACHE_HIT); + GET_COUNTER(SCALAR_CACHE_HIT); + GET_COUNTER(L0_CACHE_HIT); + GET_COUNTER(L1_CACHE_HIT); + GET_COUNTER(L2_CACHE_HIT); + + GET_COMPONENT(INST_CACHE_REQUEST_COUNT); + GET_COMPONENT(INST_CACHE_HIT_COUNT); + GET_COMPONENT(INST_CACHE_MISS_COUNT); + GET_COMPONENT(SCALAR_CACHE_REQUEST_COUNT); + GET_COMPONENT(SCALAR_CACHE_HIT_COUNT); + GET_COMPONENT(SCALAR_CACHE_MISS_COUNT); + GET_COMPONENT(L0_CACHE_REQUEST_COUNT); + GET_COMPONENT(L0_CACHE_HIT_COUNT); + GET_COMPONENT(L0_CACHE_MISS_COUNT); + GET_COMPONENT(L1_CACHE_REQUEST_COUNT); + GET_COMPONENT(L1_CACHE_HIT_COUNT); + GET_COMPONENT(L1_CACHE_MISS_COUNT); + GET_COMPONENT(L2_CACHE_REQUEST_COUNT); + GET_COMPONENT(L2_CACHE_HIT_COUNT); + GET_COMPONENT(L2_CACHE_MISS_COUNT); + +#undef GET_COMPONENT +#undef GET_COUNTER + +#define ADD(id, value) \ + util_dynarray_append(&_##id->values, (double)(value)); + +#define OP_RAW(n) \ + raw_counter_values[AC_SPM_##n][s] +#define OP_SUM2(a, b) \ + raw_counter_values[AC_SPM_##a][s] + \ + raw_counter_values[AC_SPM_##b][s] +#define OP_SUM3(a, b, c) \ + raw_counter_values[AC_SPM_##a][s] + \ + raw_counter_values[AC_SPM_##b][s] + \ + raw_counter_values[AC_SPM_##c][s] +#define OP_SUB2(a, b) \ + raw_counter_values[AC_SPM_##a][s] - \ + raw_counter_values[AC_SPM_##b][s] + + for (uint32_t s = 0; s < spm_trace->num_samples; s++) { + /* Cache group. */ + /* Instruction cache. */ + const double inst_cache_request_count = + OP_SUM3(SQC_PERF_SEL_ICACHE_HITS, SQC_PERF_SEL_ICACHE_MISSES, SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE); + const double inst_cache_hit_count = + OP_RAW(SQC_PERF_SEL_ICACHE_HITS); + const double inst_cache_miss_count = + OP_SUM2(SQC_PERF_SEL_ICACHE_MISSES, SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE); + const double inst_cache_hit = + inst_cache_request_count ? 
(inst_cache_hit_count / inst_cache_request_count) * 100.0f : 0.0f; + + ADD(INST_CACHE_REQUEST_COUNT, inst_cache_request_count); + ADD(INST_CACHE_HIT_COUNT, inst_cache_hit_count); + ADD(INST_CACHE_MISS_COUNT, inst_cache_miss_count); + ADD(INST_CACHE_HIT, inst_cache_hit); + + /* Scalar cache. */ + const double scalar_cache_request_count = + OP_SUM3(SQC_PERF_SEL_DCACHE_HITS, SQC_PERF_SEL_DCACHE_MISSES, SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE); + const double scalar_cache_hit_count = + OP_RAW(SQC_PERF_SEL_DCACHE_HITS); + const double scalar_cache_miss_count = + OP_SUM2(SQC_PERF_SEL_DCACHE_MISSES, SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE); + const double scalar_cache_hit = + scalar_cache_request_count ? (scalar_cache_hit_count / scalar_cache_request_count) * 100.0f : 0.0f; + + ADD(SCALAR_CACHE_REQUEST_COUNT, scalar_cache_request_count); + ADD(SCALAR_CACHE_HIT_COUNT, scalar_cache_hit_count); + ADD(SCALAR_CACHE_MISS_COUNT, scalar_cache_miss_count); + ADD(SCALAR_CACHE_HIT, scalar_cache_hit); + + /* L0 cache. */ + const double l0_cache_request_count = OP_RAW(TCP_PERF_SEL_REQ); + const double l0_cache_hit_count = OP_SUB2(TCP_PERF_SEL_REQ, TCP_PERF_SEL_REQ_MISS); + const double l0_cache_miss_count = OP_RAW(TCP_PERF_SEL_REQ_MISS); + const double l0_cache_hit = + l0_cache_request_count ? (l0_cache_hit_count / l0_cache_request_count) * 100.0f : 0.0f; + + ADD(L0_CACHE_REQUEST_COUNT, l0_cache_request_count); + ADD(L0_CACHE_HIT_COUNT, l0_cache_hit_count); + ADD(L0_CACHE_MISS_COUNT, l0_cache_miss_count); + ADD(L0_CACHE_HIT, l0_cache_hit); + + /* L1 cache. */ + const double l1_cache_request_count = OP_RAW(GL1C_PERF_SEL_REQ); + const double l1_cache_hit_count = OP_SUB2(GL1C_PERF_SEL_REQ, GL1C_PERF_SEL_REQ_MISS); + const double l1_cache_miss_count = OP_RAW(GL1C_PERF_SEL_REQ_MISS); + const double l1_cache_hit = + l1_cache_request_count ? 
(l1_cache_hit_count / l1_cache_request_count) * 100.0f : 0.0f; + + ADD(L1_CACHE_REQUEST_COUNT, l1_cache_request_count); + ADD(L1_CACHE_HIT_COUNT, l1_cache_hit_count); + ADD(L1_CACHE_MISS_COUNT, l1_cache_miss_count); + ADD(L1_CACHE_HIT, l1_cache_hit); + + /* L2 cache. */ + const double l2_cache_request_count = OP_RAW(GL2C_PERF_SEL_REQ); + const double l2_cache_hit_count = OP_SUB2(GL2C_PERF_SEL_REQ, GL2C_PERF_SEL_MISS); + const double l2_cache_miss_count = OP_RAW(GL2C_PERF_SEL_MISS); + const double l2_cache_hit = + l2_cache_request_count ? (l2_cache_hit_count / l2_cache_request_count) * 100.0f : 0.0f; + + ADD(L2_CACHE_REQUEST_COUNT, l2_cache_request_count); + ADD(L2_CACHE_HIT_COUNT, l2_cache_hit_count); + ADD(L2_CACHE_MISS_COUNT, l2_cache_miss_count); + ADD(L2_CACHE_HIT, l2_cache_hit); + } + +#undef ADD +#undef OP_RAW +#undef OP_SUM2 +#undef OP_SUM3 +#undef OP_SUB2 + + spm_derived_trace->num_timestamps = spm_trace->num_samples; + spm_derived_trace->sample_interval = spm_trace->sample_interval; + + for (uint32_t i = 0; i < AC_SPM_RAW_COUNTER_ID_COUNT; i++) + free(raw_counter_values[i]); + + return spm_derived_trace; +} + +void +ac_spm_destroy_derived_trace(struct ac_spm_derived_trace *spm_derived_trace) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + util_dynarray_fini(&component->values); + } + + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + util_dynarray_fini(&counter->values); + } + + free(spm_derived_trace); +} + static void ac_emit_spm_muxsel(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, enum amd_ip_type ip_type, const struct ac_spm *spm) diff --git a/src/amd/common/ac_spm.h b/src/amd/common/ac_spm.h index f21b2c59c7e..47f0915a21b 100644 --- a/src/amd/common/ac_spm.h +++ b/src/amd/common/ac_spm.h @@ -11,6 +11,8 @@ #include "ac_perfcounter.h" +#include 
"util/u_dynarray.h" + struct ac_cmdbuf; #define AC_SPM_MAX_COUNTER_PER_BLOCK 16 @@ -102,6 +104,10 @@ enum ac_spm_raw_counter_id { AC_SPM_RAW_COUNTER_ID_COUNT, }; +enum ac_spm_raw_counter_op { + AC_SPM_RAW_COUNTER_OP_SUM = 0, +}; + struct ac_spm_counter_descr { enum ac_spm_raw_counter_id id; enum ac_pc_gpu_block gpu_block; @@ -212,6 +218,103 @@ struct ac_spm_trace { uint32_t num_samples; }; +enum ac_spm_group_id { + AC_SPM_GROUP_CACHE, + AC_SPM_GROUP_COUNT, +}; + +enum ac_spm_counter_id { + AC_SPM_COUNTER_INST_CACHE_HIT, + AC_SPM_COUNTER_SCALAR_CACHE_HIT, + AC_SPM_COUNTER_L0_CACHE_HIT, + AC_SPM_COUNTER_L1_CACHE_HIT, /* < GFX12 */ + AC_SPM_COUNTER_L2_CACHE_HIT, + AC_SPM_COUNTER_COUNT, +}; + +enum ac_spm_component_id { + AC_SPM_COMPONENT_INST_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_INST_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_INST_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_L0_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_L0_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_L0_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_L1_CACHE_REQUEST_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L1_CACHE_HIT_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L1_CACHE_MISS_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L2_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_L2_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_L2_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_COUNT, +}; + +enum ac_spm_usage_type { + AC_SPM_USAGE_PERCENTAGE = 1, + AC_SPM_USAGE_ITEMS = 5, +}; + +#define AC_SPM_MAX_COMPONENTS_PER_COUNTER 3 +#define AC_SPM_MAX_COUNTERS_PER_GROUP 5 + +struct ac_spm_derived_component_descr { + enum ac_spm_component_id id; + enum ac_spm_counter_id counter_id; + const char *name; + enum ac_spm_usage_type usage; +}; + +struct ac_spm_derived_counter_descr { + enum ac_spm_counter_id id; + enum ac_spm_group_id group_id; + const char *name; + const char *desc; + enum ac_spm_usage_type usage; + uint32_t num_components; + struct 
ac_spm_derived_component_descr *components[AC_SPM_MAX_COMPONENTS_PER_COUNTER]; +}; + +struct ac_spm_derived_group_descr { + enum ac_spm_group_id id; + const char *name; + uint32_t num_counters; + struct ac_spm_derived_counter_descr *counters[AC_SPM_MAX_COUNTERS_PER_GROUP]; +}; + +struct ac_spm_derived_group { + const struct ac_spm_derived_group_descr *descr; +}; + +struct ac_spm_derived_counter { + const struct ac_spm_derived_counter_descr *descr; + + struct util_dynarray values; +}; + +struct ac_spm_derived_component { + const struct ac_spm_derived_component_descr *descr; + + struct util_dynarray values; +}; + +struct ac_spm_derived_trace { + uint32_t num_timestamps; + uint64_t *timestamps; + + uint32_t num_groups; + struct ac_spm_derived_group groups[AC_SPM_GROUP_COUNT]; + + uint32_t num_counters; + struct ac_spm_derived_counter counters[AC_SPM_COUNTER_COUNT]; + + uint32_t num_components; + struct ac_spm_derived_component components[AC_SPM_COMPONENT_COUNT]; + + uint32_t sample_interval; +}; + bool ac_init_spm(const struct radeon_info *info, const struct ac_perfcounters *pc, struct ac_spm *spm); @@ -219,6 +322,13 @@ void ac_destroy_spm(struct ac_spm *spm); bool ac_spm_get_trace(const struct ac_spm *spm, struct ac_spm_trace *trace); +struct ac_spm_derived_trace * +ac_spm_get_derived_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace); + +void +ac_spm_destroy_derived_trace(struct ac_spm_derived_trace *spm_derived_trace); + void ac_emit_spm_setup(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, enum amd_ip_type ip_type, const struct ac_spm *spm,