diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index 78162a96d61..c2f9b680fa7 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -898,14 +898,14 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr& if (instr->isFlat() || instr->isDS()) mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS); } else if (instr->isSALU() || instr->isSMEM()) { - if (instr->opcode == aco_opcode::s_waitcnt) { - wait_imm imm(state.program->gfx_level, instr->salu().imm); + wait_imm imm; + if (imm.unpack(state.program->gfx_level, instr.get())) { if (imm.vm == 0) ctx.sgprs_read_by_VMEM.reset(); if (imm.lgkm == 0) ctx.sgprs_read_by_DS.reset(); - } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->salu().imm == 0) { - ctx.sgprs_read_by_VMEM_store.reset(); + if (imm.vs == 0) + ctx.sgprs_read_by_VMEM_store.reset(); } else if (vm_vsrc == 0) { ctx.sgprs_read_by_VMEM.reset(); ctx.sgprs_read_by_DS.reset(); @@ -981,15 +981,10 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr& bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero()); } } else if (instr->isSALU()) { - /* Reducing lgkmcnt count to 0 always mitigates the hazard. */ - if (instr->opcode == aco_opcode::s_waitcnt_lgkmcnt) { - const SALU_instruction& sopk = instr->salu(); - if (sopk.imm == 0 && sopk.operands[0].physReg() == sgpr_null) - ctx.sgprs_read_by_SMEM.reset(); - } else if (instr->opcode == aco_opcode::s_waitcnt) { - wait_imm imm(state.program->gfx_level, instr->salu().imm); - if (imm.lgkm == 0) - ctx.sgprs_read_by_SMEM.reset(); + wait_imm imm; + if (imm.unpack(state.program->gfx_level, instr.get()) && imm.lgkm == 0) { + /* Reducing lgkmcnt count to 0 always mitigates the hazard. */ + ctx.sgprs_read_by_SMEM.reset(); } else if (instr->format != Format::SOPP && instr->definitions.size()) { /* SALU can mitigate the hazard */ ctx.sgprs_read_by_SMEM.reset(); @@ -1515,18 +1510,18 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr& for (Operand& op : instr->operands) fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes()); } + wait_imm imm; if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) { ctx.vgpr_used_by_vmem_load.reset(); ctx.vgpr_used_by_vmem_store.reset(); ctx.vgpr_used_by_ds.reset(); - } else if (instr->opcode == aco_opcode::s_waitcnt) { - wait_imm imm(GFX11, instr->salu().imm); + } else if (imm.unpack(state.program->gfx_level, instr.get())) { if (imm.vm == 0) ctx.vgpr_used_by_vmem_load.reset(); if (imm.lgkm == 0) ctx.vgpr_used_by_ds.reset(); - } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->salu().imm == 0) { - ctx.vgpr_used_by_vmem_store.reset(); + if (imm.vs == 0) + ctx.vgpr_used_by_vmem_store.reset(); } if (instr->isLDSDIR()) { if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] || diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index a40c7e357fd..b9d25db4f1c 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -228,16 +228,15 @@ struct wait_entry { }; struct target_info { - uint16_t max_cnt[wait_type_num] = {}; + wait_imm max_cnt; uint32_t events[wait_type_num] = {}; uint16_t unordered_events; target_info(enum amd_gfx_level gfx_level) { - max_cnt[wait_type_vm] = gfx_level >= GFX9 ? 62 : 14; - max_cnt[wait_type_exp] = 6; - max_cnt[wait_type_lgkm] = gfx_level >= GFX10 ? 62 : 14; - max_cnt[wait_type_vs] = gfx_level >= GFX10 ? 62 : 0; + max_cnt = wait_imm::max(gfx_level); + for (unsigned i = 0; i < wait_type_num; i++) + max_cnt[i] = max_cnt[i] ? max_cnt[i] - 1 : 0; events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir; @@ -402,19 +401,6 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i } } -bool -parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr) -{ - if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->operands[0].physReg() == sgpr_null) { - imm.vs = std::min(imm.vs, instr->salu().imm); - return true; - } else if (instr->opcode == aco_opcode::s_waitcnt) { - imm.combine(wait_imm(ctx.gfx_level, instr->salu().imm)); - return true; - } - return false; -} - bool parse_delay_alu(wait_ctx& ctx, alu_delay_info& delay, Instruction* instr) { @@ -962,7 +948,7 @@ handle_block(Program* program, Block& block, wait_ctx& ctx) for (size_t i = 0; i < block.instructions.size(); i++) { aco_ptr& instr = block.instructions[i]; - bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get()); + bool is_wait = queued_imm.unpack(ctx.gfx_level, instr.get()); bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get()); memory_sync_info sync_info = get_sync_info(instr.get()); diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 30e424e71a9..2c131d39c82 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -1200,32 +1200,6 @@ wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) : exp(exp_), lgkm(lgkm_), vm(vm_), vs(vs_) {} -wait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter) -{ - if (gfx_level >= GFX11) { - vm = (packed >> 10) & 0x3f; - lgkm = (packed >> 4) & 0x3f; - exp = packed & 0x7; - } else { - vm = packed & 0xf; - if (gfx_level >= GFX9) - vm |= (packed >> 10) & 0x30; - - exp = (packed >> 4) & 0x7; - - lgkm = (packed >> 8) & 0xf; - if (gfx_level >= GFX10) - lgkm |= (packed >> 8) & 0x30; - } - - if (vm == (gfx_level >= GFX9 ? 0x3f : 0xf)) - vm = wait_imm::unset_counter; - if (exp == 0x7) - exp = wait_imm::unset_counter; - if (lgkm == (gfx_level >= GFX10 ? 0x3f : 0xf)) - lgkm = wait_imm::unset_counter; -} - uint16_t wait_imm::pack(enum amd_gfx_level gfx_level) const { @@ -1257,6 +1231,68 @@ wait_imm::pack(enum amd_gfx_level gfx_level) const return imm; } +wait_imm +wait_imm::max(enum amd_gfx_level gfx_level) +{ + wait_imm imm; + imm.vm = gfx_level >= GFX9 ? 63 : 15; + imm.exp = 7; + imm.lgkm = gfx_level >= GFX10 ? 63 : 15; + imm.vs = gfx_level >= GFX10 ? 63 : 0; + return imm; +} + +bool +wait_imm::unpack(enum amd_gfx_level gfx_level, const Instruction* instr) +{ + if (!instr->isSALU() || (!instr->operands.empty() && instr->operands[0].physReg() != sgpr_null)) + return false; + + aco_opcode op = instr->opcode; + uint16_t packed = instr->salu().imm; + + if (op == aco_opcode::s_waitcnt_expcnt) { + exp = std::min(exp, packed); + } else if (op == aco_opcode::s_waitcnt_lgkmcnt) { + lgkm = std::min(lgkm, packed); + } else if (op == aco_opcode::s_waitcnt_vmcnt) { + vm = std::min(vm, packed); + } else if (op == aco_opcode::s_waitcnt_vscnt) { + vs = std::min(vs, packed); + } else if (op == aco_opcode::s_waitcnt) { + uint8_t vm2, lgkm2, exp2; + if (gfx_level >= GFX11) { + vm2 = (packed >> 10) & 0x3f; + lgkm2 = (packed >> 4) & 0x3f; + exp2 = packed & 0x7; + } else { + vm2 = packed & 0xf; + if (gfx_level >= GFX9) + vm2 |= (packed >> 10) & 0x30; + + exp2 = (packed >> 4) & 0x7; + + lgkm2 = (packed >> 8) & 0xf; + if (gfx_level >= GFX10) + lgkm2 |= (packed >> 8) & 0x30; + } + + if (vm2 == (gfx_level >= GFX9 ? 0x3f : 0xf)) + vm2 = wait_imm::unset_counter; + if (exp2 == 0x7) + exp2 = wait_imm::unset_counter; + if (lgkm2 == (gfx_level >= GFX10 ? 0x3f : 0xf)) + lgkm2 = wait_imm::unset_counter; + + vm = std::min(vm, vm2); + exp = std::min(exp, exp2); + lgkm = std::min(lgkm, lgkm2); + } else { + return false; + } + return true; +} + bool wait_imm::combine(const wait_imm& other) { diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 8746e80e077..2ced8b25b37 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -182,6 +182,8 @@ enum wait_type { wait_type_num = 4, }; +struct Instruction; + struct wait_imm { static const uint8_t unset_counter = 0xff; @@ -192,10 +194,13 @@ struct wait_imm { wait_imm(); wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_); - wait_imm(enum amd_gfx_level chip, uint16_t packed); uint16_t pack(enum amd_gfx_level chip) const; + static wait_imm max(enum amd_gfx_level gfx_level); + + bool unpack(enum amd_gfx_level gfx_level, const Instruction* instr); + bool combine(const wait_imm& other); bool empty() const; diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index 66bb079b2cf..3fe100d8e8d 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -275,13 +275,17 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins uint16_t imm = instr->salu().imm; switch (instr->opcode) { case aco_opcode::s_waitcnt: { - wait_imm unpacked(gfx_level, imm); - if (unpacked.vm != wait_imm::unset_counter) - fprintf(output, " vmcnt(%d)", unpacked.vm); - if (unpacked.exp != wait_imm::unset_counter) - fprintf(output, " expcnt(%d)", unpacked.exp); - if (unpacked.lgkm != wait_imm::unset_counter) - fprintf(output, " lgkmcnt(%d)", unpacked.lgkm); + wait_imm unpacked; + unpacked.unpack(gfx_level, instr); + const char* names[wait_type_num]; + names[wait_type_exp] = "expcnt"; + names[wait_type_vm] = "vmcnt"; + names[wait_type_lgkm] = "lgkmcnt"; + names[wait_type_vs] = "vscnt"; + for (unsigned i = 0; i < wait_type_num; i++) { + if (unpacked[i] != wait_imm::unset_counter) + fprintf(output, " %s(%d)", names[i], unpacked[i]); + } break; } case aco_opcode::s_waitcnt_depctr: { diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp index d9ba0b8c25a..65777f8fb6b 100644 --- a/src/amd/compiler/aco_statistics.cpp +++ b/src/amd/compiler/aco_statistics.cpp @@ -318,28 +318,20 @@ get_wait_imm(Program* program, aco_ptr& instr) if (instr->opcode == aco_opcode::s_endpgm) { for (unsigned i = 0; i < wait_type_num; i++) imm[i] = 0; - } else if (instr->opcode == aco_opcode::s_waitcnt) { - return wait_imm(program->gfx_level, instr->salu().imm); - } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) { - imm.vs = instr->salu().imm; - return imm; + } else if (imm.unpack(program->gfx_level, instr.get())) { } else if (instr->isVINTERP_INREG()) { imm.exp = instr->vinterp_inreg().wait_exp; if (imm.exp == 0x7) imm.exp = wait_imm::unset_counter; - return imm; } else { - unsigned max_lgkm_cnt = program->gfx_level >= GFX10 ? 62 : 14; - unsigned max_exp_cnt = 6; - unsigned max_vm_cnt = program->gfx_level >= GFX9 ? 62 : 14; - unsigned max_vs_cnt = 62; - + /* If an instruction increases a counter, it waits for it to be below maximum first. */ std::array wait_info = get_wait_counter_info(program->gfx_level, instr); - imm.lgkm = wait_info[wait_type_lgkm] ? max_lgkm_cnt : wait_imm::unset_counter; - imm.exp = wait_info[wait_type_exp] ? max_exp_cnt : wait_imm::unset_counter; - imm.vm = wait_info[wait_type_vm] ? max_vm_cnt : wait_imm::unset_counter; - imm.vs = wait_info[wait_type_vs] ? max_vs_cnt : wait_imm::unset_counter; + wait_imm max = wait_imm::max(program->gfx_level); + for (unsigned i = 0; i < wait_type_num; i++) { + if (wait_info[i]) + imm[i] = max[i] - 1; + } } return imm; } diff --git a/src/amd/compiler/tests/test_insert_waitcnt.cpp b/src/amd/compiler/tests/test_insert_waitcnt.cpp index e1a933a62d7..14c83198e53 100644 --- a/src/amd/compiler/tests/test_insert_waitcnt.cpp +++ b/src/amd/compiler/tests/test_insert_waitcnt.cpp @@ -90,7 +90,7 @@ BEGIN_TEST(insert_waitcnt.clause) Operand::zero()); //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false); - //! s_waitcnt vmcnt(0) lgkmcnt(0) + //! s_waitcnt lgkmcnt(0) vmcnt(0) //! v1: %0:v[5] = buffer_load_dword %0:s[4-7], %0:v[4], 0 bld.mubuf(aco_opcode::buffer_load_dword, def_v5, Operand(PhysReg(4), s4), op_v4, Operand::zero(), 0, false);