diff --git a/src/amd/common/ac_rgp.c b/src/amd/common/ac_rgp.c index 1d03dfeb88e..b3db17851f1 100644 --- a/src/amd/common/ac_rgp.c +++ b/src/amd/common/ac_rgp.c @@ -58,6 +58,10 @@ enum sqtt_file_chunk_type SQTT_FILE_CHUNK_TYPE_CODE_OBJECT_LOADER_EVENTS, SQTT_FILE_CHUNK_TYPE_PSO_CORRELATION, SQTT_FILE_CHUNK_TYPE_INSTRUMENTATION_TABLE, + + SQTT_FILE_CHUNK_TYPE_FIRST_TOOLS_TYPE = 128, + SQTT_FILE_CHUNK_TYPE_DERIVED_SPM_DB = SQTT_FILE_CHUNK_TYPE_FIRST_TOOLS_TYPE, + SQTT_FILE_CHUNK_TYPE_COUNT }; @@ -992,10 +996,203 @@ static void ac_sqtt_dump_spm(const struct ac_spm_trace *spm_trace, fseek(output, file_offset, SEEK_SET); } +/** + * SQTT Derived SPM DB info. + */ +struct sqtt_derived_spm_group_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t group_name_length; + uint32_t group_description_length; + uint32_t num_counters; +}; + +struct sqtt_derived_spm_counter_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t counter_name_length; + uint32_t counter_description_length; + uint32_t num_components; + uint8_t usage_type; +}; + +struct sqtt_derived_spm_component_info { + uint32_t size_in_bytes; + uint32_t offset; + uint32_t component_name_length; + uint32_t component_description_length; + uint32_t usage_type; +}; + +struct sqtt_file_chunk_derived_spm_db { + struct sqtt_file_chunk_header header; + uint32_t offset; + uint32_t flags; + uint32_t num_timestamps; + uint32_t num_groups; + uint32_t num_counters; + uint32_t num_components; + uint32_t sampling_interval; +}; + +static_assert(sizeof(struct sqtt_file_chunk_derived_spm_db) == 44, + "sqtt_file_chunk_derived_spm_db doesn't match RGP spec"); + +static void ac_sqtt_fill_derived_spm_db(const struct ac_spm_derived_trace *spm_derived_trace, + struct sqtt_file_chunk_derived_spm_db *chunk, + size_t file_offset, + uint32_t chunk_size) +{ + chunk->header.chunk_id.type = SQTT_FILE_CHUNK_TYPE_DERIVED_SPM_DB; + chunk->header.chunk_id.index = 0; + chunk->header.major_version = 0; + chunk->header.minor_version = 
0; + chunk->header.size_in_bytes = chunk_size; + + chunk->offset = sizeof(*chunk); + chunk->flags = 0; + chunk->num_timestamps = spm_derived_trace->num_timestamps; + chunk->num_groups = spm_derived_trace->num_groups; + chunk->num_counters = spm_derived_trace->num_counters; + chunk->num_components = spm_derived_trace->num_components; + chunk->sampling_interval = spm_derived_trace->sample_interval; +} + +static void ac_sqtt_dump_derived_spm(const struct ac_spm_derived_trace *spm_derived_trace, + size_t file_offset, + FILE *output) +{ + struct sqtt_file_chunk_derived_spm_db derived_spm_db; + size_t file_derived_spm_db_offset = file_offset; + + fseek(output, sizeof(struct sqtt_file_chunk_derived_spm_db), SEEK_CUR); + file_offset += sizeof(struct sqtt_file_chunk_derived_spm_db); + + /* Dump timestamps. */ + for (uint32_t i = 0; i < spm_derived_trace->num_timestamps; i++) { + uint64_t timestamp = spm_derived_trace->timestamps[i]; + + file_offset += sizeof(timestamp); + fwrite(&timestamp, sizeof(timestamp), 1, output); + } + + /* Dump SPM groups. 
*/ + for (uint32_t i = 0; i < spm_derived_trace->num_groups; i++) { + const struct ac_spm_derived_group *group = &spm_derived_trace->groups[i]; + const struct ac_spm_derived_group_descr *group_descr = group->descr; + struct sqtt_derived_spm_group_info group_info = {0}; + + const uint32_t num_counters = group_descr->num_counters; + const uint32_t name_length = strlen(group_descr->name); + + group_info.size_in_bytes = sizeof(group_info) + name_length + + num_counters * sizeof(uint32_t); + group_info.offset = sizeof(group_info); + group_info.group_name_length = name_length; + group_info.num_counters = num_counters; + + file_offset += sizeof(group_info) + group_info.group_name_length; + fwrite(&group_info, sizeof(group_info), 1, output); + fwrite(group_descr->name, group_info.group_name_length, 1, output); + + for (uint32_t j = 0; j < group_descr->num_counters; j++) { + const struct ac_spm_derived_counter_descr *counter_descr = group_descr->counters[j]; + uint32_t counter_id = counter_descr->id; + + file_offset += sizeof(uint32_t); + fwrite(&counter_id, sizeof(uint32_t), 1, output); + } + } + + /* Dump SPM counters. 
*/ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + const struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + const struct ac_spm_derived_counter_descr *counter_descr = counter->descr; + struct sqtt_derived_spm_counter_info counter_info = {0}; + + const uint32_t num_components = counter_descr->num_components; + const uint32_t name_length = strlen(counter_descr->name); + const uint32_t description_length = strlen(counter_descr->desc); + + counter_info.size_in_bytes = sizeof(counter_info) + name_length + + description_length + num_components * sizeof(uint32_t); + counter_info.offset = sizeof(counter_info); + counter_info.counter_name_length = name_length; + counter_info.counter_description_length = description_length; + counter_info.num_components = num_components; + counter_info.usage_type = counter_descr->usage; + + file_offset += sizeof(counter_info) + counter_info.counter_name_length + + counter_info.counter_description_length; + fwrite(&counter_info, sizeof(counter_info), 1, output); + fwrite(counter_descr->name, counter_info.counter_name_length, 1, output); + fwrite(counter_descr->desc, counter_info.counter_description_length, 1, output); + + for (uint32_t j = 0; j < counter_descr->num_components; j++) { + const struct ac_spm_derived_component_descr *component_descr = counter_descr->components[j]; + uint32_t component_id = component_descr->id; + + file_offset += sizeof(uint32_t); + fwrite(&component_id, sizeof(uint32_t), 1, output); + } + } + + /* Dump SPM components. 
*/ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + const struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + const struct ac_spm_derived_component_descr *component_descr = component->descr; + struct sqtt_derived_spm_component_info component_info = {0}; + + const uint32_t name_length = strlen(component_descr->name); + + component_info.size_in_bytes = sizeof(component_info) + name_length; + component_info.offset = sizeof(component_info); + component_info.component_name_length = name_length; + component_info.usage_type = component_descr->usage; + + file_offset += sizeof(component_info) + component_info.component_name_length + + component_info.component_description_length; + fwrite(&component_info, sizeof(component_info), 1, output); + fwrite(component_descr->name, component_info.component_name_length, 1, output); + } + + /* Dump counter values. */ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + const struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + + assert(util_dynarray_num_elements(&counter->values, double) == spm_derived_trace->num_timestamps); + util_dynarray_foreach(&counter->values, double, value) { + file_offset += sizeof(double); + fwrite(value, sizeof(double), 1, output); + } + } + + /* Dump component values. */ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + const struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + + assert(util_dynarray_num_elements(&component->values, double) == spm_derived_trace->num_timestamps); + util_dynarray_foreach(&component->values, double, value) { + file_offset += sizeof(double); + fwrite(value, sizeof(double), 1, output); + } + } + + /* SQTT Derived SPM chunk. 
*/ + ac_sqtt_fill_derived_spm_db(spm_derived_trace, &derived_spm_db, + file_derived_spm_db_offset, + file_offset - file_derived_spm_db_offset); + fseek(output, file_derived_spm_db_offset, SEEK_SET); + fwrite(&derived_spm_db, sizeof(struct sqtt_file_chunk_derived_spm_db), 1, output); + fseek(output, file_offset, SEEK_SET); +} + #if defined(USE_LIBELF) static void ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt_trace, - const struct ac_spm_trace *spm_trace, FILE *output) + const struct ac_spm_trace *spm_trace, + const struct ac_spm_derived_trace *spm_derived_trace, + FILE *output) { struct sqtt_file_chunk_asic_info asic_info = {0}; struct sqtt_file_chunk_cpu_info cpu_info = {0}; @@ -1193,12 +1390,25 @@ ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt } } - if (spm_trace) { + if (spm_derived_trace) { + ac_sqtt_dump_derived_spm(spm_derived_trace, file_offset, output); + } else if (spm_trace) { ac_sqtt_dump_spm(spm_trace, file_offset, output); } } #endif +static bool +ac_use_derived_spm_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace) +{ + if (!spm_trace) + return false; + + /* TODO: Enable for GPUs. */ + return false; +} + int ac_dump_rgp_capture(const struct radeon_info *info, struct ac_sqtt_trace *sqtt_trace, const struct ac_spm_trace *spm_trace) @@ -1223,7 +1433,13 @@ ac_dump_rgp_capture(const struct radeon_info *info, struct ac_sqtt_trace *sqtt_t if (!f) return -1; - ac_sqtt_dump_data(info, sqtt_trace, spm_trace, f); + struct ac_spm_derived_trace *spm_derived_trace = + ac_use_derived_spm_trace(info, spm_trace) ? 
ac_spm_get_derived_trace(info, spm_trace) : NULL; + + ac_sqtt_dump_data(info, sqtt_trace, spm_trace, spm_derived_trace, f); + + if (spm_derived_trace) + ac_spm_destroy_derived_trace(spm_derived_trace); fprintf(stderr, "RGP capture saved to '%s'\n", filename); diff --git a/src/amd/common/ac_spm.c b/src/amd/common/ac_spm.c index e05097ef778..7115fddcff4 100644 --- a/src/amd/common/ac_spm.c +++ b/src/amd/common/ac_spm.c @@ -727,6 +727,507 @@ bool ac_spm_get_trace(const struct ac_spm *spm, struct ac_spm_trace *trace) return ac_spm_get_num_samples(spm, &trace->num_samples); } +/* SPM components. */ +/* Instruction cache components. */ +static struct ac_spm_derived_component_descr gfx10_inst_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_inst_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_inst_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_INST_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_INST_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* Scalar cache components. 
*/ +static struct ac_spm_derived_component_descr gfx10_scalar_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_scalar_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_scalar_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_SCALAR_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L0 cache components. */ +static struct ac_spm_derived_component_descr gfx10_l0_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l0_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l0_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L0_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L0_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L1 cache components. 
*/ +static struct ac_spm_derived_component_descr gfx10_l1_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l1_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l1_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L1_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L1_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* L2 cache components. */ +static struct ac_spm_derived_component_descr gfx10_l2_cache_request_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_REQUEST_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Requests", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l2_cache_hit_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_HIT_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Hits", + .usage = AC_SPM_USAGE_ITEMS, +}; + +static struct ac_spm_derived_component_descr gfx10_l2_cache_miss_count_comp = { + .id = AC_SPM_COMPONENT_L2_CACHE_MISS_COUNT, + .counter_id = AC_SPM_COUNTER_L2_CACHE_HIT, + .name = "Misses", + .usage = AC_SPM_USAGE_ITEMS, +}; + +/* SPM counters. */ +static struct ac_spm_derived_counter_descr gfx10_inst_cache_hit_counter = { + .id = AC_SPM_COUNTER_INST_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "Instruction cache hit", + .desc = "The percentage of read requests made that hit the data in the " + "Instruction cache. The Instruction cache supplies shader code to an " + "executing shader. Each request is 64 bytes in size. 
Value range: 0% " + "(no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_inst_cache_request_count_comp, + &gfx10_inst_cache_hit_count_comp, + &gfx10_inst_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_scalar_cache_hit_counter = { + .id = AC_SPM_COUNTER_SCALAR_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "Scalar cache hit", + .desc = "The percentage of read requests made from executing shader code " + "that hit the data in the Scalar cache. The Scalar cache contains data " + "that does not vary in each thread across the wavefront. Each request is " + "64 bytes in size. Value range: 0% (no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_scalar_cache_request_count_comp, + &gfx10_scalar_cache_hit_count_comp, + &gfx10_scalar_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l0_cache_hit_counter = { + .id = AC_SPM_COUNTER_L0_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L0 cache hit", + .desc = "The percentage of read requests that hit the data in the L0 cache. " + "The L0 cache contains vector data, which is data that may vary in each " + "thread across the wavefront. Each request is 128 bytes in size. Value " + "range: 0% (no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l0_cache_request_count_comp, + &gfx10_l0_cache_hit_count_comp, + &gfx10_l0_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l1_cache_hit_counter = { + .id = AC_SPM_COUNTER_L1_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L1 cache hit", + .desc = "The percentage of read or write requests that hit the data in the " + "L1 cache. The L1 cache is shared across all WGPs in a single shader " + "engine. Each request is 128 bytes in size. 
Value range: 0% (no hit) to " + "100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l1_cache_request_count_comp, + &gfx10_l1_cache_hit_count_comp, + &gfx10_l1_cache_miss_count_comp, + }, +}; + +static struct ac_spm_derived_counter_descr gfx10_l2_cache_hit_counter = { + .id = AC_SPM_COUNTER_L2_CACHE_HIT, + .group_id = AC_SPM_GROUP_CACHE, + .name = "L2 cache hit", + .desc = "The percentage of read or write requests that hit the data in the " + "L2 cache. The L2 cache is shared by many blocks across the GPU, " + "including the Command Processor, Geometry Engine, all WGPs, all Render " + "Backends, and others. Each request is 128 bytes in size. Value range: 0% " + "(no hit) to 100% (optimal).", + .usage = AC_SPM_USAGE_PERCENTAGE, + .num_components = 3, + .components = { + &gfx10_l2_cache_request_count_comp, + &gfx10_l2_cache_hit_count_comp, + &gfx10_l2_cache_miss_count_comp, + }, +}; + +/* SPM groups. */ +static struct ac_spm_derived_group_descr gfx10_cache_group = { + .id = AC_SPM_GROUP_CACHE, + .name = "Cache", + .num_counters = 5, + .counters = { + &gfx10_inst_cache_hit_counter, + &gfx10_scalar_cache_hit_counter, + &gfx10_l0_cache_hit_counter, + &gfx10_l1_cache_hit_counter, + &gfx10_l2_cache_hit_counter, + }, +}; + +static struct ac_spm_derived_counter * +ac_spm_get_counter_by_id(struct ac_spm_derived_trace *spm_derived_trace, + enum ac_spm_counter_id counter_id) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + + if (counter->descr->id == counter_id) + return counter; + } + + return NULL; +} + +static struct ac_spm_derived_component * +ac_spm_get_component_by_id(struct ac_spm_derived_trace *spm_derived_trace, + enum ac_spm_component_id component_id) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + + if 
(component->descr->id == component_id) + return component; + } + + return NULL; +} + +static void +ac_spm_add_group(struct ac_spm_derived_trace *spm_derived_trace, + const struct ac_spm_derived_group_descr *group_descr) +{ + for (uint32_t i = 0; i < group_descr->num_counters; i++) { + const struct ac_spm_derived_counter_descr *counter_descr = + group_descr->counters[i]; + + for (uint32_t j = 0; j < counter_descr->num_components; j++) { + struct ac_spm_derived_component *component = + &spm_derived_trace->components[spm_derived_trace->num_components++]; + assert(spm_derived_trace->num_components <= AC_SPM_COMPONENT_COUNT); + + component->descr = counter_descr->components[j]; + } + + struct ac_spm_derived_counter *counter = + &spm_derived_trace->counters[spm_derived_trace->num_counters++]; + assert(spm_derived_trace->num_counters <= AC_SPM_COUNTER_COUNT); + counter->descr = counter_descr; + } + + struct ac_spm_derived_group *group = + &spm_derived_trace->groups[spm_derived_trace->num_groups++]; + assert(spm_derived_trace->num_groups <= AC_SPM_GROUP_COUNT); + group->descr = group_descr; +} + +static enum ac_spm_raw_counter_op +ac_spm_get_raw_counter_op(enum ac_spm_raw_counter_id id) +{ + switch (id) { + case AC_SPM_TCP_PERF_SEL_REQ: + case AC_SPM_TCP_PERF_SEL_REQ_MISS: + case AC_SPM_SQC_PERF_SEL_DCACHE_HITS: + case AC_SPM_SQC_PERF_SEL_DCACHE_MISSES: + case AC_SPM_SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE: + case AC_SPM_SQC_PERF_SEL_ICACHE_HITS: + case AC_SPM_SQC_PERF_SEL_ICACHE_MISSES: + case AC_SPM_SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE: + case AC_SPM_GL1C_PERF_SEL_REQ: + case AC_SPM_GL1C_PERF_SEL_REQ_MISS: + case AC_SPM_GL2C_PERF_SEL_REQ: + case AC_SPM_GL2C_PERF_SEL_MISS: + return AC_SPM_RAW_COUNTER_OP_SUM; + default: + UNREACHABLE("Invalid SPM raw counter ID."); + } +} + +struct ac_spm_derived_trace * +ac_spm_get_derived_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace) +{ + uint32_t sample_size_in_bytes = spm_trace->sample_size_in_bytes; + 
uint8_t *spm_data_ptr = (uint8_t *)spm_trace->ptr; + struct ac_spm_derived_trace *spm_derived_trace; + + spm_derived_trace = calloc(1, sizeof(*spm_derived_trace)); + if (!spm_derived_trace) + return NULL; + + /* Add groups to the trace. */ + ac_spm_add_group(spm_derived_trace, &gfx10_cache_group); + + spm_derived_trace->timestamps = malloc(spm_trace->num_samples * sizeof(uint64_t)); + if (!spm_derived_trace->timestamps) { + free(spm_derived_trace); + return NULL; + } + + /* Skip the reserved 32 bytes of data at beginning. */ + spm_data_ptr += 32; + + /* Collect timestamps. */ + uint64_t sample_size_in_qwords = sample_size_in_bytes / sizeof(uint64_t); + uint64_t *timestamp_ptr = (uint64_t *)spm_data_ptr; + + for (uint32_t i = 0; i < spm_trace->num_samples; i++) { + uint64_t index = i * sample_size_in_qwords; + uint64_t timestamp = timestamp_ptr[index]; + + spm_derived_trace->timestamps[i] = timestamp; + } + + /* Collect raw counter values. */ + uint64_t *raw_counter_values[AC_SPM_RAW_COUNTER_ID_COUNT]; + for (uint32_t i = 0; i < AC_SPM_RAW_COUNTER_ID_COUNT; i++) { + raw_counter_values[i] = calloc(spm_trace->num_samples, sizeof(uint64_t)); + } + + const uint32_t sample_size_in_hwords = sample_size_in_bytes / sizeof(uint16_t); + const uint16_t *counter_values_ptr = (uint16_t *)spm_data_ptr; + + for (uint32_t c = 0; c < spm_trace->num_counters; c++) { + const uint64_t offset = spm_trace->counters[c].offset; + const uint32_t id = spm_trace->counters[c].id; + const enum ac_spm_raw_counter_op op = ac_spm_get_raw_counter_op(id); + + for (uint32_t s = 0; s < spm_trace->num_samples; s++) { + const uint64_t index = offset + (s * sample_size_in_hwords); + const uint16_t value = counter_values_ptr[index]; + + switch (op) { + case AC_SPM_RAW_COUNTER_OP_SUM: + raw_counter_values[id][s] += value; + break; + default: + UNREACHABLE("Invalid SPM raw counter OP.\n"); + } + } + } + +#define GET_COMPONENT(n) \ + struct ac_spm_derived_component *_##n = \ + 
ac_spm_get_component_by_id(spm_derived_trace, AC_SPM_COMPONENT_##n); +#define GET_COUNTER(n) \ + struct ac_spm_derived_counter *_##n = \ + ac_spm_get_counter_by_id(spm_derived_trace, AC_SPM_COUNTER_##n); + + GET_COUNTER(INST_CACHE_HIT); + GET_COUNTER(SCALAR_CACHE_HIT); + GET_COUNTER(L0_CACHE_HIT); + GET_COUNTER(L1_CACHE_HIT); + GET_COUNTER(L2_CACHE_HIT); + + GET_COMPONENT(INST_CACHE_REQUEST_COUNT); + GET_COMPONENT(INST_CACHE_HIT_COUNT); + GET_COMPONENT(INST_CACHE_MISS_COUNT); + GET_COMPONENT(SCALAR_CACHE_REQUEST_COUNT); + GET_COMPONENT(SCALAR_CACHE_HIT_COUNT); + GET_COMPONENT(SCALAR_CACHE_MISS_COUNT); + GET_COMPONENT(L0_CACHE_REQUEST_COUNT); + GET_COMPONENT(L0_CACHE_HIT_COUNT); + GET_COMPONENT(L0_CACHE_MISS_COUNT); + GET_COMPONENT(L1_CACHE_REQUEST_COUNT); + GET_COMPONENT(L1_CACHE_HIT_COUNT); + GET_COMPONENT(L1_CACHE_MISS_COUNT); + GET_COMPONENT(L2_CACHE_REQUEST_COUNT); + GET_COMPONENT(L2_CACHE_HIT_COUNT); + GET_COMPONENT(L2_CACHE_MISS_COUNT); + +#undef GET_COMPONENT +#undef GET_COUNTER + +#define ADD(id, value) \ + util_dynarray_append(&_##id->values, (double)(value)); + +#define OP_RAW(n) \ + raw_counter_values[AC_SPM_##n][s] +#define OP_SUM2(a, b) \ + raw_counter_values[AC_SPM_##a][s] + \ + raw_counter_values[AC_SPM_##b][s] +#define OP_SUM3(a, b, c) \ + raw_counter_values[AC_SPM_##a][s] + \ + raw_counter_values[AC_SPM_##b][s] + \ + raw_counter_values[AC_SPM_##c][s] +#define OP_SUB2(a, b) \ + raw_counter_values[AC_SPM_##a][s] - \ + raw_counter_values[AC_SPM_##b][s] + + for (uint32_t s = 0; s < spm_trace->num_samples; s++) { + /* Cache group. */ + /* Instruction cache. */ + const double inst_cache_request_count = + OP_SUM3(SQC_PERF_SEL_ICACHE_HITS, SQC_PERF_SEL_ICACHE_MISSES, SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE); + const double inst_cache_hit_count = + OP_RAW(SQC_PERF_SEL_ICACHE_HITS); + const double inst_cache_miss_count = + OP_SUM2(SQC_PERF_SEL_ICACHE_MISSES, SQC_PERF_SEL_ICACHE_MISSES_DUPLICATE); + const double inst_cache_hit = + inst_cache_request_count ? 
(inst_cache_hit_count / inst_cache_request_count) * 100.0f : 0.0f; + + ADD(INST_CACHE_REQUEST_COUNT, inst_cache_request_count); + ADD(INST_CACHE_HIT_COUNT, inst_cache_hit_count); + ADD(INST_CACHE_MISS_COUNT, inst_cache_miss_count); + ADD(INST_CACHE_HIT, inst_cache_hit); + + /* Scalar cache. */ + const double scalar_cache_request_count = + OP_SUM3(SQC_PERF_SEL_DCACHE_HITS, SQC_PERF_SEL_DCACHE_MISSES, SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE); + const double scalar_cache_hit_count = + OP_RAW(SQC_PERF_SEL_DCACHE_HITS); + const double scalar_cache_miss_count = + OP_SUM2(SQC_PERF_SEL_DCACHE_MISSES, SQC_PERF_SEL_DCACHE_MISSES_DUPLICATE); + const double scalar_cache_hit = + scalar_cache_request_count ? (scalar_cache_hit_count / scalar_cache_request_count) * 100.0f : 0.0f; + + ADD(SCALAR_CACHE_REQUEST_COUNT, scalar_cache_request_count); + ADD(SCALAR_CACHE_HIT_COUNT, scalar_cache_hit_count); + ADD(SCALAR_CACHE_MISS_COUNT, scalar_cache_miss_count); + ADD(SCALAR_CACHE_HIT, scalar_cache_hit); + + /* L0 cache. */ + const double l0_cache_request_count = OP_RAW(TCP_PERF_SEL_REQ); + const double l0_cache_hit_count = OP_SUB2(TCP_PERF_SEL_REQ, TCP_PERF_SEL_REQ_MISS); + const double l0_cache_miss_count = OP_RAW(TCP_PERF_SEL_REQ_MISS); + const double l0_cache_hit = + l0_cache_request_count ? (l0_cache_hit_count / l0_cache_request_count) * 100.0f : 0.0f; + + ADD(L0_CACHE_REQUEST_COUNT, l0_cache_request_count); + ADD(L0_CACHE_HIT_COUNT, l0_cache_hit_count); + ADD(L0_CACHE_MISS_COUNT, l0_cache_miss_count); + ADD(L0_CACHE_HIT, l0_cache_hit); + + /* L1 cache. */ + const double l1_cache_request_count = OP_RAW(GL1C_PERF_SEL_REQ); + const double l1_cache_hit_count = OP_SUB2(GL1C_PERF_SEL_REQ, GL1C_PERF_SEL_REQ_MISS); + const double l1_cache_miss_count = OP_RAW(GL1C_PERF_SEL_REQ_MISS); + const double l1_cache_hit = + l1_cache_request_count ? 
(l1_cache_hit_count / l1_cache_request_count) * 100.0f : 0.0f; + + ADD(L1_CACHE_REQUEST_COUNT, l1_cache_request_count); + ADD(L1_CACHE_HIT_COUNT, l1_cache_hit_count); + ADD(L1_CACHE_MISS_COUNT, l1_cache_miss_count); + ADD(L1_CACHE_HIT, l1_cache_hit); + + /* L2 cache. */ + const double l2_cache_request_count = OP_RAW(GL2C_PERF_SEL_REQ); + const double l2_cache_hit_count = OP_SUB2(GL2C_PERF_SEL_REQ, GL2C_PERF_SEL_MISS); + const double l2_cache_miss_count = OP_RAW(GL2C_PERF_SEL_MISS); + const double l2_cache_hit = + l2_cache_request_count ? (l2_cache_hit_count / l2_cache_request_count) * 100.0f : 0.0f; + + ADD(L2_CACHE_REQUEST_COUNT, l2_cache_request_count); + ADD(L2_CACHE_HIT_COUNT, l2_cache_hit_count); + ADD(L2_CACHE_MISS_COUNT, l2_cache_miss_count); + ADD(L2_CACHE_HIT, l2_cache_hit); + } + +#undef ADD +#undef OP_RAW +#undef OP_SUM2 +#undef OP_SUM3 +#undef OP_SUB2 + + spm_derived_trace->num_timestamps = spm_trace->num_samples; + spm_derived_trace->sample_interval = spm_trace->sample_interval; + + for (uint32_t i = 0; i < AC_SPM_RAW_COUNTER_ID_COUNT; i++) + free(raw_counter_values[i]); + + return spm_derived_trace; +} + +void +ac_spm_destroy_derived_trace(struct ac_spm_derived_trace *spm_derived_trace) +{ + for (uint32_t i = 0; i < spm_derived_trace->num_components; i++) { + struct ac_spm_derived_component *component = &spm_derived_trace->components[i]; + util_dynarray_fini(&component->values); + } + + for (uint32_t i = 0; i < spm_derived_trace->num_counters; i++) { + struct ac_spm_derived_counter *counter = &spm_derived_trace->counters[i]; + util_dynarray_fini(&counter->values); + } + + free(spm_derived_trace); +} + static void ac_emit_spm_muxsel(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, enum amd_ip_type ip_type, const struct ac_spm *spm) diff --git a/src/amd/common/ac_spm.h b/src/amd/common/ac_spm.h index f21b2c59c7e..47f0915a21b 100644 --- a/src/amd/common/ac_spm.h +++ b/src/amd/common/ac_spm.h @@ -11,6 +11,8 @@ #include "ac_perfcounter.h" +#include 
"util/u_dynarray.h" + struct ac_cmdbuf; #define AC_SPM_MAX_COUNTER_PER_BLOCK 16 @@ -102,6 +104,10 @@ enum ac_spm_raw_counter_id { AC_SPM_RAW_COUNTER_ID_COUNT, }; +enum ac_spm_raw_counter_op { + AC_SPM_RAW_COUNTER_OP_SUM = 0, +}; + struct ac_spm_counter_descr { enum ac_spm_raw_counter_id id; enum ac_pc_gpu_block gpu_block; @@ -212,6 +218,103 @@ struct ac_spm_trace { uint32_t num_samples; }; +enum ac_spm_group_id { + AC_SPM_GROUP_CACHE, + AC_SPM_GROUP_COUNT, +}; + +enum ac_spm_counter_id { + AC_SPM_COUNTER_INST_CACHE_HIT, + AC_SPM_COUNTER_SCALAR_CACHE_HIT, + AC_SPM_COUNTER_L0_CACHE_HIT, + AC_SPM_COUNTER_L1_CACHE_HIT, /* < GFX12 */ + AC_SPM_COUNTER_L2_CACHE_HIT, + AC_SPM_COUNTER_COUNT, +}; + +enum ac_spm_component_id { + AC_SPM_COMPONENT_INST_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_INST_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_INST_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_SCALAR_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_L0_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_L0_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_L0_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_L1_CACHE_REQUEST_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L1_CACHE_HIT_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L1_CACHE_MISS_COUNT, /* < GFX12 */ + AC_SPM_COMPONENT_L2_CACHE_REQUEST_COUNT, + AC_SPM_COMPONENT_L2_CACHE_HIT_COUNT, + AC_SPM_COMPONENT_L2_CACHE_MISS_COUNT, + AC_SPM_COMPONENT_COUNT, +}; + +enum ac_spm_usage_type { + AC_SPM_USAGE_PERCENTAGE = 1, + AC_SPM_USAGE_ITEMS = 5, +}; + +#define AC_SPM_MAX_COMPONENTS_PER_COUNTER 3 +#define AC_SPM_MAX_COUNTERS_PER_GROUP 5 + +struct ac_spm_derived_component_descr { + enum ac_spm_component_id id; + enum ac_spm_counter_id counter_id; + const char *name; + enum ac_spm_usage_type usage; +}; + +struct ac_spm_derived_counter_descr { + enum ac_spm_counter_id id; + enum ac_spm_group_id group_id; + const char *name; + const char *desc; + enum ac_spm_usage_type usage; + uint32_t num_components; + struct 
ac_spm_derived_component_descr *components[AC_SPM_MAX_COMPONENTS_PER_COUNTER]; +}; + +struct ac_spm_derived_group_descr { + enum ac_spm_group_id id; + const char *name; + uint32_t num_counters; + struct ac_spm_derived_counter_descr *counters[AC_SPM_MAX_COUNTERS_PER_GROUP]; +}; + +struct ac_spm_derived_group { + const struct ac_spm_derived_group_descr *descr; +}; + +struct ac_spm_derived_counter { + const struct ac_spm_derived_counter_descr *descr; + + struct util_dynarray values; +}; + +struct ac_spm_derived_component { + const struct ac_spm_derived_component_descr *descr; + + struct util_dynarray values; +}; + +struct ac_spm_derived_trace { + uint32_t num_timestamps; + uint64_t *timestamps; + + uint32_t num_groups; + struct ac_spm_derived_group groups[AC_SPM_GROUP_COUNT]; + + uint32_t num_counters; + struct ac_spm_derived_counter counters[AC_SPM_COUNTER_COUNT]; + + uint32_t num_components; + struct ac_spm_derived_component components[AC_SPM_COMPONENT_COUNT]; + + uint32_t sample_interval; +}; + bool ac_init_spm(const struct radeon_info *info, const struct ac_perfcounters *pc, struct ac_spm *spm); @@ -219,6 +322,13 @@ void ac_destroy_spm(struct ac_spm *spm); bool ac_spm_get_trace(const struct ac_spm *spm, struct ac_spm_trace *trace); +struct ac_spm_derived_trace * +ac_spm_get_derived_trace(const struct radeon_info *info, + const struct ac_spm_trace *spm_trace); + +void +ac_spm_destroy_derived_trace(struct ac_spm_derived_trace *spm_derived_trace); + void ac_emit_spm_setup(struct ac_cmdbuf *cs, enum amd_gfx_level gfx_level, enum amd_ip_type ip_type, const struct ac_spm *spm,