From 23ecceb160c7dd2e910c773813385487b1f67ca7 Mon Sep 17 00:00:00 2001
From: Rhys Perry <pendingchaos02@gmail.com>
Date: Thu, 10 Dec 2020 09:53:51 +0000
Subject: [PATCH] aco: add latency and inverse throughput statistics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Latency is estimanted duration of a single wave, ignoring others in the
CU. It is similar to the old cycles statistic except it it's more accurate
and considers memory operations.

The InvThroughput statistic is a combination of MaxWaves, Latency and the
portion of the wave's execution which does not use various resources.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8994>
---
 src/amd/compiler/aco_interface.cpp  |   3 +-
 src/amd/compiler/aco_ir.h           |   4 +-
 src/amd/compiler/aco_statistics.cpp | 474 +++++++++++++++++++++++++++-
 3 files changed, 473 insertions(+), 8 deletions(-)
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index b878ab99ba7..c4c60d43049 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -37,7 +37,8 @@ static const std::array<aco_compiler_statistic_info, aco::num_statistics> statis
    ret[aco::statistic_instructions] = aco_compiler_statistic_info{"Instructions", "Instruction count"};
    ret[aco::statistic_copies] = aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"};
    ret[aco::statistic_branches] = aco_compiler_statistic_info{"Branches", "Branch instructions"};
-   ret[aco::statistic_cycles] = aco_compiler_statistic_info{"Busy Cycles", "Estimate of busy cycles"};
+   ret[aco::statistic_latency] = aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"};
+   ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{"Inverse Throughput", "Estimated busy cycles to execute one wave"};
    ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"};
    ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"};
    ret[aco::statistic_vmem_score] = aco_compiler_statistic_info{"VMEM Score", "Average VMEM def-use distances"};
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 375e4e0a797..3926f5ec94f 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -128,6 +128,7 @@ enum class instr_class : uint8_t {
    vmem = 17,
    waitcnt = 18,
    other = 19,
+   count,
 };
 
 enum storage_class : uint8_t {
@@ -1827,7 +1828,8 @@ enum statistic {
    statistic_instructions,
    statistic_copies,
    statistic_branches,
-   statistic_cycles,
+   statistic_latency,
+   statistic_inv_throughput,
    statistic_vmem_clauses,
    statistic_smem_clauses,
    statistic_vmem_score,
diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp
index 15baa264a89..ef0493e03e9 100644
--- a/src/amd/compiler/aco_statistics.cpp
+++ b/src/amd/compiler/aco_statistics.cpp
@@ -21,6 +21,9 @@
  * IN THE SOFTWARE.
  *
  */
+ 
+#include <deque>
+
 #include "aco_ir.h"
 #include "util/crc32.h"
 
@@ -36,6 +39,408 @@ void collect_presched_stats(Program *program)
    program->statistics[statistic_vgpr_presched] = presched_demand.vgpr;
 }
 
+class BlockCycleEstimator {
+public:
+   enum resource {
+      null = 0,
+      scalar,
+      branch_sendmsg,
+      valu,
+      valu_complex,
+      lds,
+      export_gds,
+      vmem,
+      resource_count,
+   };
+
+   BlockCycleEstimator(Program *program_) : program(program_) {}
+
+   Program *program;
+
+   int32_t cur_cycle = 0;
+   int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
+   unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};
+   int32_t reg_available[512] = {0};
+   std::deque<int32_t> lgkm;
+   std::deque<int32_t> exp;
+   std::deque<int32_t> vm;
+   std::deque<int32_t> vs;
+
+   unsigned predict_cost(aco_ptr<Instruction>& instr);
+   void add(aco_ptr<Instruction>& instr);
+   void join(const BlockCycleEstimator& other);
+private:
+   unsigned get_waitcnt_cost(wait_imm imm);
+   unsigned get_dependency_cost(aco_ptr<Instruction>& instr);
+
+   void use_resources(aco_ptr<Instruction>& instr);
+   int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);
+};
+
+struct wait_counter_info {
+   wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) :
+      vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}
+
+   unsigned vm;
+   unsigned exp;
+   unsigned lgkm;
+   unsigned vs;
+};
+
+struct perf_info {
+   int latency;
+
+   BlockCycleEstimator::resource rsrc0;
+   unsigned cost0;
+
+   BlockCycleEstimator::resource rsrc1;
+   unsigned cost1;
+};
+
+static perf_info get_perf_info(Program *program, aco_ptr<Instruction>& instr)
+{
+   instr_class cls = instr_info.classes[(int)instr->opcode];
+
+   #define WAIT(res) BlockCycleEstimator::res, 0
+   #define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
+
+   if (program->chip_class >= GFX10) {
+      /* fp64 might be incorrect */
+      switch (cls) {
+      case instr_class::valu32:
+      case instr_class::valu_convert32:
+      case instr_class::valu_fma:
+         return {5, WAIT_USE(valu, 1)};
+      case instr_class::valu64:
+         return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
+      case instr_class::valu_quarter_rate32:
+         return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
+      case instr_class::valu_transcendental32:
+         return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
+      case instr_class::valu_double:
+         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
+      case instr_class::valu_double_add:
+         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
+      case instr_class::valu_double_convert:
+         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
+      case instr_class::valu_double_transcendental:
+         return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
+      case instr_class::salu:
+         return {2, WAIT_USE(scalar, 1)};
+      case instr_class::smem:
+         return {0, WAIT_USE(scalar, 1)};
+      case instr_class::branch:
+      case instr_class::sendmsg:
+         return {0, WAIT_USE(branch_sendmsg, 1)};
+      case instr_class::ds:
+         return instr->ds().gds ?
+                perf_info{0, WAIT_USE(export_gds, 1)} :
+                perf_info{0, WAIT_USE(lds, 1)};
+      case instr_class::exp:
+         return {0, WAIT_USE(export_gds, 1)};
+      case instr_class::vmem:
+         return {0, WAIT_USE(vmem, 1)};
+      case instr_class::barrier:
+      case instr_class::waitcnt:
+      case instr_class::other:
+      default:
+         return {0};
+      }
+   } else {
+      switch (cls) {
+      case instr_class::valu32:
+         return {4, WAIT_USE(valu, 4)};
+      case instr_class::valu_convert32:
+         return {16, WAIT_USE(valu, 16)};
+      case instr_class::valu64:
+         return {8, WAIT_USE(valu, 8)};
+      case instr_class::valu_quarter_rate32:
+         return {16, WAIT_USE(valu, 16)};
+      case instr_class::valu_fma:
+         return program->dev.has_fast_fma32 ?
+                perf_info{4, WAIT_USE(valu, 4)} :
+                perf_info{16, WAIT_USE(valu, 16)};
+      case instr_class::valu_transcendental32:
+         return {16, WAIT_USE(valu, 16)};
+      case instr_class::valu_double:
+         return {64, WAIT_USE(valu, 64)};
+      case instr_class::valu_double_add:
+         return {32, WAIT_USE(valu, 32)};
+      case instr_class::valu_double_convert:
+         return {16, WAIT_USE(valu, 16)};
+      case instr_class::valu_double_transcendental:
+         return {64, WAIT_USE(valu, 64)};
+      case instr_class::salu:
+         return {4, WAIT_USE(scalar, 4)};
+      case instr_class::smem:
+         return {4, WAIT_USE(scalar, 4)};
+      case instr_class::branch:
+         return {8, WAIT_USE(branch_sendmsg, 8)};
+         return {4, WAIT_USE(branch_sendmsg, 4)};
+      case instr_class::ds:
+         return instr->ds().gds ?
+                perf_info{4, WAIT_USE(export_gds, 4)} :
+                perf_info{4, WAIT_USE(lds, 4)};
+      case instr_class::exp:
+         return {16, WAIT_USE(export_gds, 16)};
+      case instr_class::vmem:
+         return {4, WAIT_USE(vmem, 4)};
+      case instr_class::barrier:
+      case instr_class::waitcnt:
+      case instr_class::other:
+      default:
+         return {4};
+      }
+   }
+
+   #undef WAIT_USE
+   #undef WAIT
+}
+
+void BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
+{
+   perf_info perf = get_perf_info(program, instr);
+
+   if (perf.rsrc0 != resource_count) {
+      res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;
+      res_usage[(int)perf.rsrc0] += perf.cost0;
+   }
+
+   if (perf.rsrc1 != resource_count) {
+      res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;
+      res_usage[(int)perf.rsrc1] += perf.cost1;
+   }
+}
+
+int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
+{
+   perf_info perf = get_perf_info(program, instr);
+
+   int32_t cost = 0;
+   if (perf.rsrc0 != resource_count)
+      cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);
+   if (perf.rsrc1 != resource_count)
+      cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);
+
+   return cost;
+}
+
+static wait_counter_info get_wait_counter_info(aco_ptr<Instruction>& instr)
+{
+   /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
+    * depends a lot on the situation. */
+
+   if (instr->isEXP())
+      return wait_counter_info(0, 16, 0, 0);
+
+   if (instr->isFlatLike()) {
+      unsigned lgkm = instr->isFlat() ? 20 : 0;
+      if (!instr->definitions.empty())
+         return wait_counter_info(230, 0, lgkm, 0);
+      else
+         return wait_counter_info(0, 0, lgkm, 230);
+   }
+
+   if (instr->isSMEM()) {
+      if (instr->definitions.empty())
+         return wait_counter_info(0, 0, 200, 0);
+      if (instr->operands.empty()) /* s_memtime and s_memrealtime */
+         return wait_counter_info(0, 0, 1, 0);
+
+      bool likely_desc_load = instr->operands[0].size() == 2;
+      bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
+      bool const_offset = instr->operands[1].isConstant() &&
+                          (!soe || instr->operands.back().isConstant());
+
+      if (likely_desc_load || const_offset)
+         return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */
+
+      return wait_counter_info(0, 0, 200, 0);
+   }
+
+   if (instr->format == Format::DS)
+      return wait_counter_info(0, 0, 20, 0);
+
+   if (instr->isVMEM() && !instr->definitions.empty())
+      return wait_counter_info(320, 0, 0, 0);
+
+   if (instr->isVMEM() && instr->definitions.empty())
+      return wait_counter_info(0, 0, 0, 320);
+
+   return wait_counter_info(0, 0, 0, 0);
+}
+
+static wait_imm get_wait_imm(Program *program, aco_ptr<Instruction>& instr)
+{
+   if (instr->opcode == aco_opcode::s_endpgm) {
+      return wait_imm(0, 0, 0, 0);
+   } else if (instr->opcode == aco_opcode::s_waitcnt) {
+      return wait_imm(GFX10_3, instr->sopp().imm);
+   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
+      return wait_imm(0, 0, 0, instr->sopk().imm);
+   } else {
+      unsigned max_lgkm_cnt = program->chip_class >= GFX10 ? 62 : 14;
+      unsigned max_exp_cnt = 6;
+      unsigned max_vm_cnt = program->chip_class >= GFX9 ? 62 : 14;
+      unsigned max_vs_cnt = 62;
+
+      wait_counter_info wait_info = get_wait_counter_info(instr);
+      wait_imm imm;
+      imm.lgkm = wait_info.lgkm ? max_lgkm_cnt : wait_imm::unset_counter;
+      imm.exp = wait_info.exp ? max_exp_cnt : wait_imm::unset_counter;
+      imm.vm = wait_info.vm ? max_vm_cnt : wait_imm::unset_counter;
+      imm.vs = wait_info.vs ? max_vs_cnt : wait_imm::unset_counter;
+      return imm;
+   }
+}
+
+unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
+{
+   int deps_available = cur_cycle;
+
+   wait_imm imm = get_wait_imm(program, instr);
+   if (imm.vm != wait_imm::unset_counter) {
+      for (int i = 0; i < (int)vm.size() - imm.vm; i++)
+         deps_available = MAX2(deps_available, vm[i]);
+   }
+   if (imm.exp != wait_imm::unset_counter) {
+      for (int i = 0; i < (int)exp.size() - imm.exp; i++)
+         deps_available = MAX2(deps_available, exp[i]);
+   }
+   if (imm.lgkm != wait_imm::unset_counter) {
+      for (int i = 0; i < (int)lgkm.size() - imm.lgkm; i++)
+         deps_available = MAX2(deps_available, lgkm[i]);
+   }
+   if (imm.vs != wait_imm::unset_counter) {
+      for (int i = 0; i < (int)vs.size() - imm.vs; i++)
+         deps_available = MAX2(deps_available, vs[i]);
+   }
+
+   if (instr->opcode == aco_opcode::s_endpgm) {
+      for (unsigned i = 0; i < 512; i++)
+         deps_available = MAX2(deps_available, reg_available[i]);
+   } else if (program->chip_class >= GFX10) {
+      for (Operand& op : instr->operands) {
+         if (op.isConstant() || op.isUndefined())
+            continue;
+         for (unsigned i = 0; i < op.size(); i++)
+            deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);
+      }
+   }
+
+   if (program->chip_class < GFX10)
+      deps_available = align(deps_available, 4);
+
+   return deps_available - cur_cycle;
+}
+
+unsigned BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)
+{
+   int32_t dep = get_dependency_cost(instr);
+   return dep + std::max(cycles_until_res_available(instr) - dep, 0);
+}
+
+static bool is_vector(aco_opcode op)
+{
+   switch (instr_info.classes[(int)op]) {
+   case instr_class::valu32:
+   case instr_class::valu_convert32:
+   case instr_class::valu_fma:
+   case instr_class::valu_double:
+   case instr_class::valu_double_add:
+   case instr_class::valu_double_convert:
+   case instr_class::valu_double_transcendental:
+   case instr_class::vmem:
+   case instr_class::ds:
+   case instr_class::exp:
+   case instr_class::valu64:
+   case instr_class::valu_quarter_rate32:
+   case instr_class::valu_transcendental32:
+      return true;
+   default:
+      return false;
+   }
+}
+
+void BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
+{
+   perf_info perf = get_perf_info(program, instr);
+
+   cur_cycle += get_dependency_cost(instr);
+
+   unsigned start;
+   bool dual_issue = program->chip_class >= GFX10 && program->wave_size == 64 &&
+                     is_vector(instr->opcode) && program->workgroup_size > 32;
+   for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {
+      cur_cycle += cycles_until_res_available(instr);
+
+      start = cur_cycle;
+      use_resources(instr);
+
+      /* GCN is in-order and doesn't begin the next instruction until the current one finishes */
+      cur_cycle += program->chip_class >= GFX10 ? 1 : perf.latency;
+   }
+
+   wait_imm imm = get_wait_imm(program, instr);
+   while (lgkm.size() > imm.lgkm)
+      lgkm.pop_front();
+   while (exp.size() > imm.exp)
+      exp.pop_front();
+   while (vm.size() > imm.vm)
+      vm.pop_front();
+   while (vs.size() > imm.vs)
+      vs.pop_front();
+
+   wait_counter_info wait_info = get_wait_counter_info(instr);
+   if (wait_info.exp)
+      exp.push_back(cur_cycle + wait_info.exp);
+   if (wait_info.lgkm)
+      lgkm.push_back(cur_cycle + wait_info.lgkm);
+   if (wait_info.vm)
+      vm.push_back(cur_cycle + wait_info.vm);
+   if (wait_info.vs)
+      vs.push_back(cur_cycle + wait_info.vs);
+
+   /* This is inaccurate but shouldn't affect anything after waitcnt insertion.
+    * Before waitcnt insertion, this is necessary to consider memory operations.
+    */
+   int latency = MAX3(wait_info.exp, wait_info.lgkm, wait_info.vm);
+   int32_t result_available = start + MAX2(perf.latency, latency);
+
+   for (Definition& def : instr->definitions) {
+      int32_t *available = &reg_available[def.physReg().reg()];
+      for (unsigned i = 0; i < def.size(); i++)
+         available[i] = MAX2(available[i], result_available);
+   }
+}
+
+static void join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff)
+{
+   for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++)
+      queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff);
+   for (int i = pred.size() - queue.size() - 1; i >= 0; i--)
+      queue.push_front(pred[i] + cycle_diff);
+}
+
+void BlockCycleEstimator::join(const BlockCycleEstimator& pred)
+{
+   assert(cur_cycle == 0);
+
+   for (unsigned i = 0; i < (unsigned)resource_count; i++) {
+      assert(res_usage[i] == 0);
+      res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);
+   }
+
+   for (unsigned i = 0; i < 512; i++)
+      reg_available[i] = MAX2(reg_available[i],
+                              pred.reg_available[i] - pred.cur_cycle + cur_cycle);
+
+   join_queue(lgkm, pred.lgkm, -pred.cur_cycle);
+   join_queue(exp, pred.exp, -pred.cur_cycle);
+   join_queue(vm, pred.vm, -pred.cur_cycle);
+   join_queue(vs, pred.vs, -pred.cur_cycle);
+}
+
 /* instructions/branches/vmem_clauses/smem_clauses/cycles */
 void collect_preasm_stats(Program *program)
 {
@@ -68,17 +473,74 @@ void collect_preasm_stats(Program *program)
             program->statistics[statistic_smem_clauses] += smem_clause_res.size();
             smem_clause_res.clear();
           }
-
-         /* TODO: this incorrectly assumes instructions always take 4 cycles */
-         /* assume loops execute 4 times (TODO: it would be nice to be able to consider loop unrolling) */
-         unsigned iter = 1 << (block.loop_nest_depth * 2);
-         unsigned cycles = instr->opcode == aco_opcode::v_mul_lo_u32 ? 16 : 4;
-         program->statistics[statistic_cycles] += cycles * iter;
       }
 
       program->statistics[statistic_vmem_clauses] += vmem_clause_res.size();
       program->statistics[statistic_smem_clauses] += smem_clause_res.size();
    }
+
+   double latency = 0;
+   double usage[(int)BlockCycleEstimator::resource_count] = {0};
+   std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);
+
+   for (Block& block : program->blocks) {
+      BlockCycleEstimator& block_est = blocks[block.index];
+      for (unsigned pred : block.linear_preds)
+         block_est.join(blocks[pred]);
+
+      for (aco_ptr<Instruction>& instr : block.instructions)
+         block_est.add(instr);
+
+      /* TODO: it would be nice to be able to consider estimated loop trip
+       * counts used for loop unrolling.
+       */
+
+      /* TODO: estimate the trip_count of divergent loops (those which break
+       * divergent) higher than of uniform loops
+       */
+
+      /* Assume loops execute 8-2 times, uniform branches are taken 50% the time,
+       * and any lane in the wave takes a side of a divergent branch 75% of the
+       * time.
+       */
+      double iter = 1.0;
+      iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;
+      iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;
+      iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;
+      iter *= pow(0.5, block.uniform_if_depth);
+      iter *= pow(0.75, block.divergent_if_logical_depth);
+
+      bool divergent_if_linear_else = block.logical_preds.empty() && block.linear_preds.size() == 1 && block.linear_succs.size() == 1 &&
+                                      program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
+      if (divergent_if_linear_else)
+         iter *= 0.25;
+
+      latency += block_est.cur_cycle * iter;
+      for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)
+         usage[i] += block_est.res_usage[i] * iter;
+   }
+
+   /* This likely exaggerates the effectiveness of parallelism because it
+    * ignores instruction ordering. It can assume there might be SALU/VALU/etc
+    * work to from other waves while one is idle but that might not be the case
+    * because those other waves have not reached such a point yet.
+    */
+
+   double parallelism = program->num_waves;
+   for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {
+      if (usage[i] > 0.0)
+         parallelism = MIN2(parallelism, latency / usage[i]);
+   }
+   double waves_per_cycle = 1.0 / latency * parallelism;
+   double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);
+
+   double max_utilization = 1.0;
+   if (program->workgroup_size != UINT_MAX)
+      max_utilization = program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
+   wave64_per_cycle *= max_utilization;
+
+   program->statistics[statistic_latency] = round(latency);
+   program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle);
 }
 
 void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code)