From 23ecceb160c7dd2e910c773813385487b1f67ca7 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 10 Dec 2020 09:53:51 +0000 Subject: [PATCH] aco: add latency and inverse throughput statistics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Latency is estimanted duration of a single wave, ignoring others in the CU. It is similar to the old cycles statistic except it it's more accurate and considers memory operations. The InvThroughput statistic is a combination of MaxWaves, Latency and the portion of the wave's execution which does not use various resources. Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_interface.cpp | 3 +- src/amd/compiler/aco_ir.h | 4 +- src/amd/compiler/aco_statistics.cpp | 474 +++++++++++++++++++++++++++- 3 files changed, 473 insertions(+), 8 deletions(-) diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index b878ab99ba7..c4c60d43049 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -37,7 +37,8 @@ static const std::array statis ret[aco::statistic_instructions] = aco_compiler_statistic_info{"Instructions", "Instruction count"}; ret[aco::statistic_copies] = aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"}; ret[aco::statistic_branches] = aco_compiler_statistic_info{"Branches", "Branch instructions"}; - ret[aco::statistic_cycles] = aco_compiler_statistic_info{"Busy Cycles", "Estimate of busy cycles"}; + ret[aco::statistic_latency] = aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"}; + ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{"Inverse Throughput", "Estimated busy cycles to execute one wave"}; ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"}; ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"}; ret[aco::statistic_vmem_score] = aco_compiler_statistic_info{"VMEM Score", "Average VMEM def-use distances"}; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 375e4e0a797..3926f5ec94f 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -128,6 +128,7 @@ enum class instr_class : uint8_t { vmem = 17, waitcnt = 18, other = 19, + count, }; enum storage_class : uint8_t { @@ -1827,7 +1828,8 @@ enum statistic { statistic_instructions, statistic_copies, statistic_branches, - statistic_cycles, + statistic_latency, + statistic_inv_throughput, statistic_vmem_clauses, statistic_smem_clauses, statistic_vmem_score, diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp index 15baa264a89..ef0493e03e9 100644 --- a/src/amd/compiler/aco_statistics.cpp +++ b/src/amd/compiler/aco_statistics.cpp @@ -21,6 +21,9 @@ * IN THE SOFTWARE. * */ + +#include + #include "aco_ir.h" #include "util/crc32.h" @@ -36,6 +39,408 @@ void collect_presched_stats(Program *program) program->statistics[statistic_vgpr_presched] = presched_demand.vgpr; } +class BlockCycleEstimator { +public: + enum resource { + null = 0, + scalar, + branch_sendmsg, + valu, + valu_complex, + lds, + export_gds, + vmem, + resource_count, + }; + + BlockCycleEstimator(Program *program_) : program(program_) {} + + Program *program; + + int32_t cur_cycle = 0; + int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0}; + unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0}; + int32_t reg_available[512] = {0}; + std::deque lgkm; + std::deque exp; + std::deque vm; + std::deque vs; + + unsigned predict_cost(aco_ptr& instr); + void add(aco_ptr& instr); + void join(const BlockCycleEstimator& other); +private: + unsigned get_waitcnt_cost(wait_imm imm); + unsigned get_dependency_cost(aco_ptr& instr); + + void use_resources(aco_ptr& instr); + int32_t cycles_until_res_available(aco_ptr& instr); +}; + +struct wait_counter_info { + wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) : + vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {} + + unsigned vm; + unsigned exp; + unsigned lgkm; + unsigned vs; +}; + +struct perf_info { + int latency; + + BlockCycleEstimator::resource rsrc0; + unsigned cost0; + + BlockCycleEstimator::resource rsrc1; + unsigned cost1; +}; + +static perf_info get_perf_info(Program *program, aco_ptr& instr) +{ + instr_class cls = instr_info.classes[(int)instr->opcode]; + + #define WAIT(res) BlockCycleEstimator::res, 0 + #define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt + + if (program->chip_class >= GFX10) { + /* fp64 might be incorrect */ + switch (cls) { + case instr_class::valu32: + case instr_class::valu_convert32: + case instr_class::valu_fma: + return {5, WAIT_USE(valu, 1)}; + case instr_class::valu64: + return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)}; + case instr_class::valu_quarter_rate32: + return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)}; + case instr_class::valu_transcendental32: + return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)}; + case instr_class::valu_double: + return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; + case instr_class::valu_double_add: + return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; + case instr_class::valu_double_convert: + return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; + case instr_class::valu_double_transcendental: + return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; + case instr_class::salu: + return {2, WAIT_USE(scalar, 1)}; + case instr_class::smem: + return {0, WAIT_USE(scalar, 1)}; + case instr_class::branch: + case instr_class::sendmsg: + return {0, WAIT_USE(branch_sendmsg, 1)}; + case instr_class::ds: + return instr->ds().gds ? + perf_info{0, WAIT_USE(export_gds, 1)} : + perf_info{0, WAIT_USE(lds, 1)}; + case instr_class::exp: + return {0, WAIT_USE(export_gds, 1)}; + case instr_class::vmem: + return {0, WAIT_USE(vmem, 1)}; + case instr_class::barrier: + case instr_class::waitcnt: + case instr_class::other: + default: + return {0}; + } + } else { + switch (cls) { + case instr_class::valu32: + return {4, WAIT_USE(valu, 4)}; + case instr_class::valu_convert32: + return {16, WAIT_USE(valu, 16)}; + case instr_class::valu64: + return {8, WAIT_USE(valu, 8)}; + case instr_class::valu_quarter_rate32: + return {16, WAIT_USE(valu, 16)}; + case instr_class::valu_fma: + return program->dev.has_fast_fma32 ? + perf_info{4, WAIT_USE(valu, 4)} : + perf_info{16, WAIT_USE(valu, 16)}; + case instr_class::valu_transcendental32: + return {16, WAIT_USE(valu, 16)}; + case instr_class::valu_double: + return {64, WAIT_USE(valu, 64)}; + case instr_class::valu_double_add: + return {32, WAIT_USE(valu, 32)}; + case instr_class::valu_double_convert: + return {16, WAIT_USE(valu, 16)}; + case instr_class::valu_double_transcendental: + return {64, WAIT_USE(valu, 64)}; + case instr_class::salu: + return {4, WAIT_USE(scalar, 4)}; + case instr_class::smem: + return {4, WAIT_USE(scalar, 4)}; + case instr_class::branch: + return {8, WAIT_USE(branch_sendmsg, 8)}; + return {4, WAIT_USE(branch_sendmsg, 4)}; + case instr_class::ds: + return instr->ds().gds ? + perf_info{4, WAIT_USE(export_gds, 4)} : + perf_info{4, WAIT_USE(lds, 4)}; + case instr_class::exp: + return {16, WAIT_USE(export_gds, 16)}; + case instr_class::vmem: + return {4, WAIT_USE(vmem, 4)}; + case instr_class::barrier: + case instr_class::waitcnt: + case instr_class::other: + default: + return {4}; + } + } + + #undef WAIT_USE + #undef WAIT +} + +void BlockCycleEstimator::use_resources(aco_ptr& instr) +{ + perf_info perf = get_perf_info(program, instr); + + if (perf.rsrc0 != resource_count) { + res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0; + res_usage[(int)perf.rsrc0] += perf.cost0; + } + + if (perf.rsrc1 != resource_count) { + res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1; + res_usage[(int)perf.rsrc1] += perf.cost1; + } +} + +int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr& instr) +{ + perf_info perf = get_perf_info(program, instr); + + int32_t cost = 0; + if (perf.rsrc0 != resource_count) + cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle); + if (perf.rsrc1 != resource_count) + cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle); + + return cost; +} + +static wait_counter_info get_wait_counter_info(aco_ptr& instr) +{ + /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance + * depends a lot on the situation. */ + + if (instr->isEXP()) + return wait_counter_info(0, 16, 0, 0); + + if (instr->isFlatLike()) { + unsigned lgkm = instr->isFlat() ? 20 : 0; + if (!instr->definitions.empty()) + return wait_counter_info(230, 0, lgkm, 0); + else + return wait_counter_info(0, 0, lgkm, 230); + } + + if (instr->isSMEM()) { + if (instr->definitions.empty()) + return wait_counter_info(0, 0, 200, 0); + if (instr->operands.empty()) /* s_memtime and s_memrealtime */ + return wait_counter_info(0, 0, 1, 0); + + bool likely_desc_load = instr->operands[0].size() == 2; + bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4); + bool const_offset = instr->operands[1].isConstant() && + (!soe || instr->operands.back().isConstant()); + + if (likely_desc_load || const_offset) + return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */ + + return wait_counter_info(0, 0, 200, 0); + } + + if (instr->format == Format::DS) + return wait_counter_info(0, 0, 20, 0); + + if (instr->isVMEM() && !instr->definitions.empty()) + return wait_counter_info(320, 0, 0, 0); + + if (instr->isVMEM() && instr->definitions.empty()) + return wait_counter_info(0, 0, 0, 320); + + return wait_counter_info(0, 0, 0, 0); +} + +static wait_imm get_wait_imm(Program *program, aco_ptr& instr) +{ + if (instr->opcode == aco_opcode::s_endpgm) { + return wait_imm(0, 0, 0, 0); + } else if (instr->opcode == aco_opcode::s_waitcnt) { + return wait_imm(GFX10_3, instr->sopp().imm); + } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) { + return wait_imm(0, 0, 0, instr->sopk().imm); + } else { + unsigned max_lgkm_cnt = program->chip_class >= GFX10 ? 62 : 14; + unsigned max_exp_cnt = 6; + unsigned max_vm_cnt = program->chip_class >= GFX9 ? 62 : 14; + unsigned max_vs_cnt = 62; + + wait_counter_info wait_info = get_wait_counter_info(instr); + wait_imm imm; + imm.lgkm = wait_info.lgkm ? max_lgkm_cnt : wait_imm::unset_counter; + imm.exp = wait_info.exp ? max_exp_cnt : wait_imm::unset_counter; + imm.vm = wait_info.vm ? max_vm_cnt : wait_imm::unset_counter; + imm.vs = wait_info.vs ? max_vs_cnt : wait_imm::unset_counter; + return imm; + } +} + +unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr& instr) +{ + int deps_available = cur_cycle; + + wait_imm imm = get_wait_imm(program, instr); + if (imm.vm != wait_imm::unset_counter) { + for (int i = 0; i < (int)vm.size() - imm.vm; i++) + deps_available = MAX2(deps_available, vm[i]); + } + if (imm.exp != wait_imm::unset_counter) { + for (int i = 0; i < (int)exp.size() - imm.exp; i++) + deps_available = MAX2(deps_available, exp[i]); + } + if (imm.lgkm != wait_imm::unset_counter) { + for (int i = 0; i < (int)lgkm.size() - imm.lgkm; i++) + deps_available = MAX2(deps_available, lgkm[i]); + } + if (imm.vs != wait_imm::unset_counter) { + for (int i = 0; i < (int)vs.size() - imm.vs; i++) + deps_available = MAX2(deps_available, vs[i]); + } + + if (instr->opcode == aco_opcode::s_endpgm) { + for (unsigned i = 0; i < 512; i++) + deps_available = MAX2(deps_available, reg_available[i]); + } else if (program->chip_class >= GFX10) { + for (Operand& op : instr->operands) { + if (op.isConstant() || op.isUndefined()) + continue; + for (unsigned i = 0; i < op.size(); i++) + deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]); + } + } + + if (program->chip_class < GFX10) + deps_available = align(deps_available, 4); + + return deps_available - cur_cycle; +} + +unsigned BlockCycleEstimator::predict_cost(aco_ptr& instr) +{ + int32_t dep = get_dependency_cost(instr); + return dep + std::max(cycles_until_res_available(instr) - dep, 0); +} + +static bool is_vector(aco_opcode op) +{ + switch (instr_info.classes[(int)op]) { + case instr_class::valu32: + case instr_class::valu_convert32: + case instr_class::valu_fma: + case instr_class::valu_double: + case instr_class::valu_double_add: + case instr_class::valu_double_convert: + case instr_class::valu_double_transcendental: + case instr_class::vmem: + case instr_class::ds: + case instr_class::exp: + case instr_class::valu64: + case instr_class::valu_quarter_rate32: + case instr_class::valu_transcendental32: + return true; + default: + return false; + } +} + +void BlockCycleEstimator::add(aco_ptr& instr) +{ + perf_info perf = get_perf_info(program, instr); + + cur_cycle += get_dependency_cost(instr); + + unsigned start; + bool dual_issue = program->chip_class >= GFX10 && program->wave_size == 64 && + is_vector(instr->opcode) && program->workgroup_size > 32; + for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) { + cur_cycle += cycles_until_res_available(instr); + + start = cur_cycle; + use_resources(instr); + + /* GCN is in-order and doesn't begin the next instruction until the current one finishes */ + cur_cycle += program->chip_class >= GFX10 ? 1 : perf.latency; + } + + wait_imm imm = get_wait_imm(program, instr); + while (lgkm.size() > imm.lgkm) + lgkm.pop_front(); + while (exp.size() > imm.exp) + exp.pop_front(); + while (vm.size() > imm.vm) + vm.pop_front(); + while (vs.size() > imm.vs) + vs.pop_front(); + + wait_counter_info wait_info = get_wait_counter_info(instr); + if (wait_info.exp) + exp.push_back(cur_cycle + wait_info.exp); + if (wait_info.lgkm) + lgkm.push_back(cur_cycle + wait_info.lgkm); + if (wait_info.vm) + vm.push_back(cur_cycle + wait_info.vm); + if (wait_info.vs) + vs.push_back(cur_cycle + wait_info.vs); + + /* This is inaccurate but shouldn't affect anything after waitcnt insertion. + * Before waitcnt insertion, this is necessary to consider memory operations. + */ + int latency = MAX3(wait_info.exp, wait_info.lgkm, wait_info.vm); + int32_t result_available = start + MAX2(perf.latency, latency); + + for (Definition& def : instr->definitions) { + int32_t *available = ®_available[def.physReg().reg()]; + for (unsigned i = 0; i < def.size(); i++) + available[i] = MAX2(available[i], result_available); + } +} + +static void join_queue(std::deque& queue, const std::deque& pred, int cycle_diff) +{ + for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++) + queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff); + for (int i = pred.size() - queue.size() - 1; i >= 0; i--) + queue.push_front(pred[i] + cycle_diff); +} + +void BlockCycleEstimator::join(const BlockCycleEstimator& pred) +{ + assert(cur_cycle == 0); + + for (unsigned i = 0; i < (unsigned)resource_count; i++) { + assert(res_usage[i] == 0); + res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle); + } + + for (unsigned i = 0; i < 512; i++) + reg_available[i] = MAX2(reg_available[i], + pred.reg_available[i] - pred.cur_cycle + cur_cycle); + + join_queue(lgkm, pred.lgkm, -pred.cur_cycle); + join_queue(exp, pred.exp, -pred.cur_cycle); + join_queue(vm, pred.vm, -pred.cur_cycle); + join_queue(vs, pred.vs, -pred.cur_cycle); +} + /* instructions/branches/vmem_clauses/smem_clauses/cycles */ void collect_preasm_stats(Program *program) { @@ -68,17 +473,74 @@ void collect_preasm_stats(Program *program) program->statistics[statistic_smem_clauses] += smem_clause_res.size(); smem_clause_res.clear(); } - - /* TODO: this incorrectly assumes instructions always take 4 cycles */ - /* assume loops execute 4 times (TODO: it would be nice to be able to consider loop unrolling) */ - unsigned iter = 1 << (block.loop_nest_depth * 2); - unsigned cycles = instr->opcode == aco_opcode::v_mul_lo_u32 ? 16 : 4; - program->statistics[statistic_cycles] += cycles * iter; } program->statistics[statistic_vmem_clauses] += vmem_clause_res.size(); program->statistics[statistic_smem_clauses] += smem_clause_res.size(); } + + double latency = 0; + double usage[(int)BlockCycleEstimator::resource_count] = {0}; + std::vector blocks(program->blocks.size(), program); + + for (Block& block : program->blocks) { + BlockCycleEstimator& block_est = blocks[block.index]; + for (unsigned pred : block.linear_preds) + block_est.join(blocks[pred]); + + for (aco_ptr& instr : block.instructions) + block_est.add(instr); + + /* TODO: it would be nice to be able to consider estimated loop trip + * counts used for loop unrolling. + */ + + /* TODO: estimate the trip_count of divergent loops (those which break + * divergent) higher than of uniform loops + */ + + /* Assume loops execute 8-2 times, uniform branches are taken 50% the time, + * and any lane in the wave takes a side of a divergent branch 75% of the + * time. + */ + double iter = 1.0; + iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0; + iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0; + iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0; + iter *= pow(0.5, block.uniform_if_depth); + iter *= pow(0.75, block.divergent_if_logical_depth); + + bool divergent_if_linear_else = block.logical_preds.empty() && block.linear_preds.size() == 1 && block.linear_succs.size() == 1 && + program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert); + if (divergent_if_linear_else) + iter *= 0.25; + + latency += block_est.cur_cycle * iter; + for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) + usage[i] += block_est.res_usage[i] * iter; + } + + /* This likely exaggerates the effectiveness of parallelism because it + * ignores instruction ordering. It can assume there might be SALU/VALU/etc + * work to from other waves while one is idle but that might not be the case + * because those other waves have not reached such a point yet. + */ + + double parallelism = program->num_waves; + for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) { + if (usage[i] > 0.0) + parallelism = MIN2(parallelism, latency / usage[i]); + } + double waves_per_cycle = 1.0 / latency * parallelism; + double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0); + + double max_utilization = 1.0; + if (program->workgroup_size != UINT_MAX) + max_utilization = program->workgroup_size / (double)align(program->workgroup_size, program->wave_size); + wave64_per_cycle *= max_utilization; + + program->statistics[statistic_latency] = round(latency); + program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle); } void collect_postasm_stats(Program *program, const std::vector& code)