diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 51f02d47549..ac780821006 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -60,12 +60,12 @@ enum wait_event : uint16_t { }; enum counter_type : uint8_t { - counter_exp = 1 << 0, - counter_lgkm = 1 << 1, - counter_vm = 1 << 2, - counter_vs = 1 << 3, - counter_alu = 1 << 4, - num_counters = 5, + counter_exp = 1 << wait_type_exp, + counter_lgkm = 1 << wait_type_lgkm, + counter_vm = 1 << wait_type_vm, + counter_vs = 1 << wait_type_vs, + counter_alu = 1 << wait_type_num, + num_counters = wait_type_num + 1, }; enum vmem_type : uint8_t { @@ -74,12 +74,6 @@ enum vmem_type : uint8_t { vmem_bvh = 1 << 2, }; -static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | - event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir; -static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg; -static const uint16_t vm_events = event_vmem | event_flat; -static const uint16_t vs_events = event_vmem_store; - /* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different * wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal * that we should switch to a different wave and contains info on dependencies as to @@ -164,30 +158,6 @@ struct alu_delay_info { } }; -uint8_t -get_counters_for_event(wait_event ev) -{ - switch (ev) { - case event_smem: - case event_lds: - case event_gds: - case event_sendmsg: return counter_lgkm; - case event_vmem: return counter_vm; - case event_vmem_store: return counter_vs; - case event_flat: return counter_vm | counter_lgkm; - case event_exp_pos: - case event_exp_param: - case event_exp_mrt_null: - case event_gds_gpr_lock: - case event_vmem_gpr_lock: - case event_ldsdir: return counter_exp; - case event_valu: - case event_trans: - case event_salu: return counter_alu; - default: return 0; - } -} - struct wait_entry { wait_imm imm; alu_delay_info delay; @@ -197,10 +167,10 @@ struct wait_entry { bool logical : 1; uint8_t vmem_types : 4; - wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, bool logical_, - bool wait_on_read_) - : imm(imm_), delay(delay_), events(event_), counters(get_counters_for_event(event_)), - wait_on_read(wait_on_read_), logical(logical_), vmem_types(0) + wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, uint8_t counters_, + bool logical_, bool wait_on_read_) + : imm(imm_), delay(delay_), events(event_), counters(counters_), wait_on_read(wait_on_read_), + logical(logical_), vmem_types(0) {} bool join(const wait_entry& other) @@ -218,38 +188,24 @@ struct wait_entry { return changed; } - void remove_counter(counter_type counter) + void remove_alu_counter() { - counters &= ~counter; + counters &= ~counter_alu; + delay = alu_delay_info(); + events &= ~(event_valu | event_trans | event_salu); + } - if (counter == counter_lgkm) { - imm.lgkm = wait_imm::unset_counter; - events &= ~(event_smem | event_lds | event_gds | event_sendmsg); - } - - if (counter == counter_vm) { - imm.vm = wait_imm::unset_counter; - events &= ~event_vmem; - vmem_types = 0; - } - - if (counter == counter_exp) { - imm.exp = wait_imm::unset_counter; - events &= ~exp_events; - } - - if (counter == counter_vs) { - imm.vs = wait_imm::unset_counter; - events &= ~event_vmem_store; - } + void remove_wait(wait_type type, uint32_t type_events) + { + counters &= ~(1 << type); + imm[type] = wait_imm::unset_counter; + events &= ~type_events | event_flat; if (!(counters & counter_lgkm) && !(counters & counter_vm)) - events &= ~event_flat; + events &= ~(type_events & event_flat); - if (counter == counter_alu) { - delay = alu_delay_info(); - events &= ~(event_valu | event_trans | event_salu); - } + if (type == wait_type_vm) + vmem_types = 0; } UNUSED void print(FILE* output) const @@ -270,14 +226,46 @@ struct wait_entry { } }; +struct target_info { + uint16_t max_cnt[wait_type_num] = {}; + uint32_t events[wait_type_num] = {}; + uint16_t unordered_events; + + target_info(enum amd_gfx_level gfx_level) + { + max_cnt[wait_type_vm] = gfx_level >= GFX9 ? 62 : 14; + max_cnt[wait_type_exp] = 6; + max_cnt[wait_type_lgkm] = gfx_level >= GFX10 ? 62 : 14; + max_cnt[wait_type_vs] = gfx_level >= GFX10 ? 62 : 0; + + events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null | + event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir; + events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_flat | event_sendmsg; + events[wait_type_vm] = event_vmem | event_flat; + events[wait_type_vs] = event_vmem_store; + + for (unsigned i = 0; i < wait_type_num; i++) { + u_foreach_bit (j, events[i]) + counters[j] |= (1 << i); + } + counters[ffs(event_valu) - 1] |= counter_alu; + counters[ffs(event_trans) - 1] |= counter_alu; + counters[ffs(event_salu) - 1] |= counter_alu; + + unordered_events = event_smem | (gfx_level < GFX10 ? event_flat : 0); + } + + uint8_t get_counters_for_event(uint16_t event) const { return counters[ffs(event) - 1]; } + +private: + /* Bitfields of counters affected by each event */ + uint8_t counters[num_events] = {}; +}; + struct wait_ctx { Program* program; enum amd_gfx_level gfx_level; - uint16_t max_vm_cnt; - uint16_t max_exp_cnt; - uint16_t max_lgkm_cnt; - uint16_t max_vs_cnt; - uint16_t unordered_events = event_smem | event_flat; + const target_info* info; bool vm_nonzero = false; bool exp_nonzero = false; @@ -293,12 +281,8 @@ struct wait_ctx { std::map gpr_map; wait_ctx() {} - wait_ctx(Program* program_) - : program(program_), gfx_level(program_->gfx_level), - max_vm_cnt(program_->gfx_level >= GFX9 ? 62 : 14), max_exp_cnt(6), - max_lgkm_cnt(program_->gfx_level >= GFX10 ? 62 : 14), - max_vs_cnt(program_->gfx_level >= GFX10 ? 62 : 0), - unordered_events(event_smem | (program_->gfx_level < GFX10 ? event_flat : 0)) + wait_ctx(Program* program_, const target_info* info_) + : program(program_), gfx_level(program_->gfx_level), info(info_) {} bool join(const wait_ctx* other, bool logical) @@ -338,11 +322,6 @@ struct wait_ctx { return changed; } - void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) - { - entry.remove_counter(counter); - } - UNUSED void print(FILE* output) const { fprintf(output, "exp_nonzero: %u\n", exp_nonzero); @@ -415,13 +394,16 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i /* Vector Memory reads and writes return in the order they were issued */ uint8_t vmem_type = get_vmem_type(instr); - if (vmem_type && ((it->second.events & vm_events) == event_vmem) && - it->second.vmem_types == vmem_type) - reg_imm.vm = wait_imm::unset_counter; + if (vmem_type) { + wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event_vmem)) - 1); + if ((it->second.events & ctx.info->events[type]) == event_vmem && + (type != wait_type_vm || it->second.vmem_types == vmem_type)) + reg_imm[type] = wait_imm::unset_counter; + } /* LDS reads and writes return in the order they were issued. same for GDS */ - if (instr->isDS() && - (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds)) + if (instr->isDS() && (it->second.events & ctx.info->events[wait_type_lgkm]) == + (instr->ds().gds ? event_gds : event_lds)) reg_imm.lgkm = wait_imm::unset_counter; wait.combine(reg_imm); @@ -517,7 +499,7 @@ update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles) wait_entry& entry = it->second; if (clear) { - entry.remove_counter(counter_alu); + entry.remove_alu_counter(); } else { entry.delay.valu_instrs += is_valu ? 1 : 0; entry.delay.trans_instrs += is_trans ? 1 : 0; @@ -527,7 +509,7 @@ update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles) entry.delay.fixup(); if (it->second.delay.empty()) - entry.remove_counter(counter_alu); + entry.remove_alu_counter(); } if (!entry.counters) @@ -619,19 +601,19 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx, uint16_t& bar_ev = ctx.barrier_events[i]; if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) { bar.exp = wait_imm::unset_counter; - bar_ev &= ~exp_events; + bar_ev &= ~ctx.info->events[wait_type_exp]; } if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) { bar.vm = wait_imm::unset_counter; - bar_ev &= ~(vm_events & ~event_flat); + bar_ev &= ~(ctx.info->events[wait_type_vm] & ~event_flat); } if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) { bar.lgkm = wait_imm::unset_counter; - bar_ev &= ~(lgkm_events & ~event_flat); + bar_ev &= ~(ctx.info->events[wait_type_lgkm] & ~event_flat); } if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) { bar.vs = wait_imm::unset_counter; - bar_ev &= ~vs_events; + bar_ev &= ~ctx.info->events[wait_type_vs]; } if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter) bar_ev &= ~event_flat; @@ -646,20 +628,20 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx, std::map::iterator it = ctx.gpr_map.begin(); while (it != ctx.gpr_map.end()) { if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp) - ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp); + it->second.remove_wait(wait_type_exp, ctx.info->events[wait_type_exp]); if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm) - ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm); + it->second.remove_wait(wait_type_vm, ctx.info->events[wait_type_vm]); if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm) - ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm); + it->second.remove_wait(wait_type_lgkm, ctx.info->events[wait_type_lgkm]); if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs) - ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs); + it->second.remove_wait(wait_type_vs, ctx.info->events[wait_type_vs]); if (delay.valu_instrs <= it->second.delay.valu_instrs) it->second.delay.valu_instrs = alu_delay_info::valu_nop; if (delay.trans_instrs <= it->second.delay.trans_instrs) it->second.delay.trans_instrs = alu_delay_info::trans_nop; it->second.delay.fixup(); if (it->second.delay.empty()) - ctx.wait_and_remove_from_entry(it->first, it->second, counter_alu); + it->second.remove_alu_counter(); if (!it->second.counters) it = ctx.gpr_map.erase(it); else @@ -698,15 +680,15 @@ update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_syn bar.exp = 0; if (counters & counter_vs) bar.vs = 0; - } else if (!(bar_ev & ctx.unordered_events) && !(ctx.unordered_events & event)) { - if (counters & counter_lgkm && (bar_ev & lgkm_events) == event) - update_barrier_counter(&bar.lgkm, ctx.max_lgkm_cnt); - if (counters & counter_vm && (bar_ev & vm_events) == event) - update_barrier_counter(&bar.vm, ctx.max_vm_cnt); - if (counters & counter_exp && (bar_ev & exp_events) == event) - update_barrier_counter(&bar.exp, ctx.max_exp_cnt); - if (counters & counter_vs && (bar_ev & vs_events) == event) - update_barrier_counter(&bar.vs, ctx.max_vs_cnt); + } else if (!(bar_ev & ctx.info->unordered_events) && !(ctx.info->unordered_events & event)) { + if (counters & counter_lgkm && (bar_ev & ctx.info->events[wait_type_lgkm]) == event) + update_barrier_counter(&bar.lgkm, ctx.info->max_cnt[wait_type_lgkm]); + if (counters & counter_vm && (bar_ev & ctx.info->events[wait_type_vm]) == event) + update_barrier_counter(&bar.vm, ctx.info->max_cnt[wait_type_vm]); + if (counters & counter_exp && (bar_ev & ctx.info->events[wait_type_exp]) == event) + update_barrier_counter(&bar.exp, ctx.info->max_cnt[wait_type_exp]); + if (counters & counter_vs && (bar_ev & ctx.info->events[wait_type_vs]) == event) + update_barrier_counter(&bar.vs, ctx.info->max_cnt[wait_type_vs]); } } } @@ -714,7 +696,7 @@ update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_syn void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info()) { - uint8_t counters = get_counters_for_event(event); + uint8_t counters = ctx.info->get_counters_for_event(event); if (counters & counter_lgkm) ctx.lgkm_nonzero = true; @@ -727,7 +709,7 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_ update_barrier_imm(ctx, counters, event, sync); - if (ctx.unordered_events & event) + if (ctx.info->unordered_events & event) return; if (ctx.pending_flat_lgkm) @@ -738,22 +720,22 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_ for (std::pair& e : ctx.gpr_map) { wait_entry& entry = e.second; - if (entry.events & ctx.unordered_events) + if (entry.events & ctx.info->unordered_events) continue; assert(entry.events); - if ((counters & counter_exp) && (entry.events & exp_events) == event && - entry.imm.exp < ctx.max_exp_cnt) + if ((counters & counter_exp) && (entry.events & ctx.info->events[wait_type_exp]) == event && + entry.imm.exp < ctx.info->max_cnt[wait_type_exp]) entry.imm.exp++; - if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && - entry.imm.lgkm < ctx.max_lgkm_cnt) + if ((counters & counter_lgkm) && (entry.events & ctx.info->events[wait_type_lgkm]) == event && + entry.imm.lgkm < ctx.info->max_cnt[wait_type_lgkm]) entry.imm.lgkm++; - if ((counters & counter_vm) && (entry.events & vm_events) == event && - entry.imm.vm < ctx.max_vm_cnt) + if ((counters & counter_vm) && (entry.events & ctx.info->events[wait_type_vm]) == event && + entry.imm.vm < ctx.info->max_cnt[wait_type_vm]) entry.imm.vm++; - if ((counters & counter_vs) && (entry.events & vs_events) == event && - entry.imm.vs < ctx.max_vs_cnt) + if ((counters & counter_vs) && (entry.events & ctx.info->events[wait_type_vs]) == event && + entry.imm.vs < ctx.info->max_cnt[wait_type_vs]) entry.imm.vs++; } } @@ -782,7 +764,7 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read, uint8_t vmem_types = 0, unsigned cycles = 0, bool force_linear = false) { - uint16_t counters = get_counters_for_event(event); + uint16_t counters = ctx.info->get_counters_for_event(event); wait_imm imm; if (counters & counter_lgkm) imm.lgkm = 0; @@ -804,7 +786,8 @@ insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, boo delay.salu_cycles = cycles; } - wait_entry new_entry(event, imm, delay, !rc.is_linear() && !force_linear, wait_on_read); + wait_entry new_entry(event, imm, delay, counters, !rc.is_linear() && !force_linear, + wait_on_read); new_entry.vmem_types |= vmem_types; for (unsigned i = 0; i < rc.size(); i++) { @@ -1126,16 +1109,18 @@ handle_block(Program* program, Block& block, wait_ctx& ctx) void insert_wait_states(Program* program) { + target_info info(program->gfx_level); + /* per BB ctx */ std::vector done(program->blocks.size()); - std::vector in_ctx(program->blocks.size(), wait_ctx(program)); - std::vector out_ctx(program->blocks.size(), wait_ctx(program)); + std::vector in_ctx(program->blocks.size(), wait_ctx(program, &info)); + std::vector out_ctx(program->blocks.size(), wait_ctx(program, &info)); std::stack> loop_header_indices; unsigned loop_progress = 0; if (program->pending_lds_access) { - update_barrier_imm(in_ctx[0], get_counters_for_event(event_lds), event_lds, + update_barrier_imm(in_ctx[0], info.get_counters_for_event(event_lds), event_lds, memory_sync_info(storage_shared)); }