diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
index 78162a96d61..c2f9b680fa7 100644
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -898,14 +898,14 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
       if (instr->isFlat() || instr->isDS())
          mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS);
    } else if (instr->isSALU() || instr->isSMEM()) {
-      if (instr->opcode == aco_opcode::s_waitcnt) {
-         wait_imm imm(state.program->gfx_level, instr->salu().imm);
+      wait_imm imm;
+      if (imm.unpack(state.program->gfx_level, instr.get())) {
          if (imm.vm == 0)
             ctx.sgprs_read_by_VMEM.reset();
          if (imm.lgkm == 0)
             ctx.sgprs_read_by_DS.reset();
-      } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->salu().imm == 0) {
-         ctx.sgprs_read_by_VMEM_store.reset();
+         if (imm.vs == 0)
+            ctx.sgprs_read_by_VMEM_store.reset();
       } else if (vm_vsrc == 0) {
          ctx.sgprs_read_by_VMEM.reset();
          ctx.sgprs_read_by_DS.reset();
@@ -981,15 +981,10 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
          bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
       }
    } else if (instr->isSALU()) {
-      /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
-      if (instr->opcode == aco_opcode::s_waitcnt_lgkmcnt) {
-         const SALU_instruction& sopk = instr->salu();
-         if (sopk.imm == 0 && sopk.operands[0].physReg() == sgpr_null)
-            ctx.sgprs_read_by_SMEM.reset();
-      } else if (instr->opcode == aco_opcode::s_waitcnt) {
-         wait_imm imm(state.program->gfx_level, instr->salu().imm);
-         if (imm.lgkm == 0)
-            ctx.sgprs_read_by_SMEM.reset();
+      wait_imm imm;
+      if (imm.unpack(state.program->gfx_level, instr.get()) && imm.lgkm == 0) {
+         /* Reducing lgkmcnt count to 0 always mitigates the hazard. */
+         ctx.sgprs_read_by_SMEM.reset();
       } else if (instr->format != Format::SOPP && instr->definitions.size()) {
          /* SALU can mitigate the hazard */
          ctx.sgprs_read_by_SMEM.reset();
@@ -1515,18 +1510,18 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
       for (Operand& op : instr->operands)
          fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes());
    }
+   wait_imm imm;
    if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) {
       ctx.vgpr_used_by_vmem_load.reset();
       ctx.vgpr_used_by_vmem_store.reset();
       ctx.vgpr_used_by_ds.reset();
-   } else if (instr->opcode == aco_opcode::s_waitcnt) {
-      wait_imm imm(GFX11, instr->salu().imm);
+   } else if (imm.unpack(state.program->gfx_level, instr.get())) {
       if (imm.vm == 0)
          ctx.vgpr_used_by_vmem_load.reset();
       if (imm.lgkm == 0)
          ctx.vgpr_used_by_ds.reset();
-   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->salu().imm == 0) {
-      ctx.vgpr_used_by_vmem_store.reset();
+      if (imm.vs == 0)
+         ctx.vgpr_used_by_vmem_store.reset();
    }
    if (instr->isLDSDIR()) {
       if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] ||
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index a40c7e357fd..b9d25db4f1c 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -228,16 +228,15 @@ struct wait_entry {
 };
 
 struct target_info {
-   uint16_t max_cnt[wait_type_num] = {};
+   wait_imm max_cnt;
    uint32_t events[wait_type_num] = {};
    uint16_t unordered_events;
 
    target_info(enum amd_gfx_level gfx_level)
    {
-      max_cnt[wait_type_vm] = gfx_level >= GFX9 ? 62 : 14;
-      max_cnt[wait_type_exp] = 6;
-      max_cnt[wait_type_lgkm] = gfx_level >= GFX10 ? 62 : 14;
-      max_cnt[wait_type_vs] = gfx_level >= GFX10 ? 62 : 0;
+      max_cnt = wait_imm::max(gfx_level);
+      for (unsigned i = 0; i < wait_type_num; i++)
+         max_cnt[i] = max_cnt[i] ? max_cnt[i] - 1 : 0;
 
       events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null |
                               event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
@@ -402,19 +401,6 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i
    }
 }
 
-bool
-parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
-{
-   if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->operands[0].physReg() == sgpr_null) {
-      imm.vs = std::min<uint8_t>(imm.vs, instr->salu().imm);
-      return true;
-   } else if (instr->opcode == aco_opcode::s_waitcnt) {
-      imm.combine(wait_imm(ctx.gfx_level, instr->salu().imm));
-      return true;
-   }
-   return false;
-}
-
 bool
 parse_delay_alu(wait_ctx& ctx, alu_delay_info& delay, Instruction* instr)
 {
@@ -962,7 +948,7 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
    for (size_t i = 0; i < block.instructions.size(); i++) {
       aco_ptr<Instruction>& instr = block.instructions[i];
 
-      bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());
+      bool is_wait = queued_imm.unpack(ctx.gfx_level, instr.get());
       bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get());
 
       memory_sync_info sync_info = get_sync_info(instr.get());
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 30e424e71a9..2c131d39c82 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -1200,32 +1200,6 @@ wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
     : exp(exp_), lgkm(lgkm_), vm(vm_), vs(vs_)
 {}
 
-wait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter)
-{
-   if (gfx_level >= GFX11) {
-      vm = (packed >> 10) & 0x3f;
-      lgkm = (packed >> 4) & 0x3f;
-      exp = packed & 0x7;
-   } else {
-      vm = packed & 0xf;
-      if (gfx_level >= GFX9)
-         vm |= (packed >> 10) & 0x30;
-
-      exp = (packed >> 4) & 0x7;
-
-      lgkm = (packed >> 8) & 0xf;
-      if (gfx_level >= GFX10)
-         lgkm |= (packed >> 8) & 0x30;
-   }
-
-   if (vm == (gfx_level >= GFX9 ? 0x3f : 0xf))
-      vm = wait_imm::unset_counter;
-   if (exp == 0x7)
-      exp = wait_imm::unset_counter;
-   if (lgkm == (gfx_level >= GFX10 ? 0x3f : 0xf))
-      lgkm = wait_imm::unset_counter;
-}
-
 uint16_t
 wait_imm::pack(enum amd_gfx_level gfx_level) const
 {
@@ -1257,6 +1231,68 @@ wait_imm::pack(enum amd_gfx_level gfx_level) const
    return imm;
 }
 
+wait_imm /* Builds a wait_imm holding a fixed per-counter limit selected by gfx_level. */
+wait_imm::max(enum amd_gfx_level gfx_level)
+{
+   wait_imm imm;
+   imm.vm = gfx_level >= GFX9 ? 63 : 15; /* 0x3f / 0xf: the value unpack() maps to unset_counter */
+   imm.exp = 7; /* 0x7 on every gfx level, likewise treated as unset by unpack() */
+   imm.lgkm = gfx_level >= GFX10 ? 63 : 15; /* widened at GFX10, whereas vm widens at GFX9 */
+   imm.vs = gfx_level >= GFX10 ? 63 : 0; /* NOTE(review): 0 presumably means no vs counter - confirm */
+   return imm; /* target_info and get_wait_imm in this change use these values minus one */
+}
+
+bool /* Folds the wait encoded by instr into *this (per-counter minimum); false if not handled. */
+wait_imm::unpack(enum amd_gfx_level gfx_level, const Instruction* instr)
+{
+   if (!instr->isSALU() || (!instr->operands.empty() && instr->operands[0].physReg() != sgpr_null))
+      return false; /* NOTE(review): presumably a non-null SGPR makes the count unknown - confirm */
+
+   aco_opcode op = instr->opcode;
+   uint16_t packed = instr->salu().imm; /* raw immediate; its layout depends on the opcode below */
+
+   if (op == aco_opcode::s_waitcnt_expcnt) { /* single-counter forms: the immediate is the count */
+      exp = std::min<uint8_t>(exp, packed); /* packed is narrowed to uint8_t before comparing */
+   } else if (op == aco_opcode::s_waitcnt_lgkmcnt) {
+      lgkm = std::min<uint8_t>(lgkm, packed);
+   } else if (op == aco_opcode::s_waitcnt_vmcnt) {
+      vm = std::min<uint8_t>(vm, packed);
+   } else if (op == aco_opcode::s_waitcnt_vscnt) {
+      vs = std::min<uint8_t>(vs, packed);
+   } else if (op == aco_opcode::s_waitcnt) { /* combined form: vm, exp and lgkm share one immediate */
+      uint8_t vm2, lgkm2, exp2; /* counts decoded from the packed immediate */
+      if (gfx_level >= GFX11) { /* GFX11+: exp in bits 2:0, lgkm in bits 9:4, vm in bits 15:10 */
+         vm2 = (packed >> 10) & 0x3f;
+         lgkm2 = (packed >> 4) & 0x3f;
+         exp2 = packed & 0x7;
+      } else { /* before GFX11: vm in bits 3:0, exp in bits 6:4, lgkm in bits 11:8 */
+         vm2 = packed & 0xf;
+         if (gfx_level >= GFX9)
+            vm2 |= (packed >> 10) & 0x30; /* vm bits 5:4 are taken from immediate bits 15:14 */
+
+         exp2 = (packed >> 4) & 0x7;
+
+         lgkm2 = (packed >> 8) & 0xf;
+         if (gfx_level >= GFX10)
+            lgkm2 |= (packed >> 8) & 0x30; /* lgkm bits 5:4 are taken from immediate bits 13:12 */
+      }
+
+      if (vm2 == (gfx_level >= GFX9 ? 0x3f : 0xf)) /* an all-ones field becomes unset_counter */
+         vm2 = wait_imm::unset_counter;
+      if (exp2 == 0x7)
+         exp2 = wait_imm::unset_counter;
+      if (lgkm2 == (gfx_level >= GFX10 ? 0x3f : 0xf))
+         lgkm2 = wait_imm::unset_counter;
+
+      vm = std::min(vm, vm2); /* keep the smaller count; unset_counter (0xff) never wins the min */
+      exp = std::min(exp, exp2);
+      lgkm = std::min(lgkm, lgkm2); /* vs is left untouched by s_waitcnt */
+   } else {
+      return false; /* any other SALU opcode is not decoded here; *this is unchanged */
+   }
+   return true;
+}
+
 bool
 wait_imm::combine(const wait_imm& other)
 {
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 8746e80e077..2ced8b25b37 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -182,6 +182,8 @@ enum wait_type {
    wait_type_num = 4,
 };
 
+struct Instruction;
+
 struct wait_imm {
    static const uint8_t unset_counter = 0xff;
 
@@ -192,10 +194,13 @@ struct wait_imm {
 
    wait_imm();
    wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
-   wait_imm(enum amd_gfx_level chip, uint16_t packed);
 
    uint16_t pack(enum amd_gfx_level chip) const;
 
+   static wait_imm max(enum amd_gfx_level gfx_level);
+
+   bool unpack(enum amd_gfx_level gfx_level, const Instruction* instr);
+
    bool combine(const wait_imm& other);
 
    bool empty() const;
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index 66bb079b2cf..3fe100d8e8d 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -275,13 +275,17 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
       uint16_t imm = instr->salu().imm;
       switch (instr->opcode) {
       case aco_opcode::s_waitcnt: {
-         wait_imm unpacked(gfx_level, imm);
-         if (unpacked.vm != wait_imm::unset_counter)
-            fprintf(output, " vmcnt(%d)", unpacked.vm);
-         if (unpacked.exp != wait_imm::unset_counter)
-            fprintf(output, " expcnt(%d)", unpacked.exp);
-         if (unpacked.lgkm != wait_imm::unset_counter)
-            fprintf(output, " lgkmcnt(%d)", unpacked.lgkm);
+         wait_imm unpacked;
+         unpacked.unpack(gfx_level, instr);
+         const char* names[wait_type_num];
+         names[wait_type_exp] = "expcnt";
+         names[wait_type_vm] = "vmcnt";
+         names[wait_type_lgkm] = "lgkmcnt";
+         names[wait_type_vs] = "vscnt";
+         for (unsigned i = 0; i < wait_type_num; i++) {
+            if (unpacked[i] != wait_imm::unset_counter)
+               fprintf(output, " %s(%d)", names[i], unpacked[i]);
+         }
          break;
       }
       case aco_opcode::s_waitcnt_depctr: {
diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp
index d9ba0b8c25a..65777f8fb6b 100644
--- a/src/amd/compiler/aco_statistics.cpp
+++ b/src/amd/compiler/aco_statistics.cpp
@@ -318,28 +318,20 @@ get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
    if (instr->opcode == aco_opcode::s_endpgm) {
       for (unsigned i = 0; i < wait_type_num; i++)
          imm[i] = 0;
-   } else if (instr->opcode == aco_opcode::s_waitcnt) {
-      return wait_imm(program->gfx_level, instr->salu().imm);
-   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
-      imm.vs = instr->salu().imm;
-      return imm;
+   } else if (imm.unpack(program->gfx_level, instr.get())) {
    } else if (instr->isVINTERP_INREG()) {
       imm.exp = instr->vinterp_inreg().wait_exp;
       if (imm.exp == 0x7)
          imm.exp = wait_imm::unset_counter;
-      return imm;
    } else {
-      unsigned max_lgkm_cnt = program->gfx_level >= GFX10 ? 62 : 14;
-      unsigned max_exp_cnt = 6;
-      unsigned max_vm_cnt = program->gfx_level >= GFX9 ? 62 : 14;
-      unsigned max_vs_cnt = 62;
-
+      /* If an instruction increases a counter, it waits for it to be below maximum first. */
       std::array<unsigned, wait_type_num> wait_info =
          get_wait_counter_info(program->gfx_level, instr);
-      imm.lgkm = wait_info[wait_type_lgkm] ? max_lgkm_cnt : wait_imm::unset_counter;
-      imm.exp = wait_info[wait_type_exp] ? max_exp_cnt : wait_imm::unset_counter;
-      imm.vm = wait_info[wait_type_vm] ? max_vm_cnt : wait_imm::unset_counter;
-      imm.vs = wait_info[wait_type_vs] ? max_vs_cnt : wait_imm::unset_counter;
+      wait_imm max = wait_imm::max(program->gfx_level);
+      for (unsigned i = 0; i < wait_type_num; i++) {
+         if (wait_info[i])
+            imm[i] = max[i] - 1;
+      }
    }
    return imm;
 }
diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp
index e1a933a62d7..14c83198e53 100644
--- a/src/amd/compiler/tests/test_insert_waitcnt.cpp
+++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp
@@ -90,7 +90,7 @@ BEGIN_TEST(insert_waitcnt.clause)
             Operand::zero());
    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
-   //! s_waitcnt vmcnt(0) lgkmcnt(0)
+   //! s_waitcnt lgkmcnt(0) vmcnt(0)
    //! v1: %0:v[5] = buffer_load_dword %0:s[4-7], %0:v[4], 0
    bld.mubuf(aco_opcode::buffer_load_dword, def_v5, Operand(PhysReg(4), s4), op_v4, Operand::zero(),
              0, false);