aco/waitcnt: add target_info

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28981>
This commit is contained in:
Rhys Perry
2024-05-03 11:19:55 +01:00
committed by Marge Bot
parent 20b4e30e25
commit ff2e3ef5eb

View File

@@ -60,12 +60,12 @@ enum wait_event : uint16_t {
};
enum counter_type : uint8_t {
counter_exp = 1 << 0,
counter_lgkm = 1 << 1,
counter_vm = 1 << 2,
counter_vs = 1 << 3,
counter_alu = 1 << 4,
num_counters = 5,
counter_exp = 1 << wait_type_exp,
counter_lgkm = 1 << wait_type_lgkm,
counter_vm = 1 << wait_type_vm,
counter_vs = 1 << wait_type_vs,
counter_alu = 1 << wait_type_num,
num_counters = wait_type_num + 1,
};
enum vmem_type : uint8_t {
@@ -74,12 +74,6 @@ enum vmem_type : uint8_t {
vmem_bvh = 1 << 2,
};
/* File-scope masks of wait_event bits, one mask per counter (exp, lgkm, vm, vs).
 * event_flat is a member of both lgkm_events and vm_events. */
static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null |
event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
static const uint16_t vm_events = event_vmem | event_flat;
static const uint16_t vs_events = event_vmem_store;
/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
* wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
* that we should switch to a different wave and contains info on dependencies as to
@@ -164,30 +158,6 @@ struct alu_delay_info {
}
};
/* Returns the bitmask of counter_type flags associated with the wait_event `ev`.
 *
 * The switch matches `ev` against individual event values, so any value that is not
 * exactly one of the listed events falls through to the default case and yields 0.
 * event_flat is the only event that maps to two counters (counter_vm and counter_lgkm).
 */
uint8_t
get_counters_for_event(wait_event ev)
{
switch (ev) {
case event_smem:
case event_lds:
case event_gds:
case event_sendmsg: return counter_lgkm;
case event_vmem: return counter_vm;
case event_vmem_store: return counter_vs;
case event_flat: return counter_vm | counter_lgkm;
case event_exp_pos:
case event_exp_param:
case event_exp_mrt_null:
case event_gds_gpr_lock:
case event_vmem_gpr_lock:
case event_ldsdir: return counter_exp;
case event_valu:
case event_trans:
case event_salu: return counter_alu;
default: return 0;
}
}
struct wait_entry {
wait_imm imm;
alu_delay_info delay;
@@ -197,10 +167,10 @@ struct wait_entry {
bool logical : 1;
uint8_t vmem_types : 4;
wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, bool logical_,
bool wait_on_read_)
: imm(imm_), delay(delay_), events(event_), counters(get_counters_for_event(event_)),
wait_on_read(wait_on_read_), logical(logical_), vmem_types(0)
wait_entry(wait_event event_, wait_imm imm_, alu_delay_info delay_, uint8_t counters_,
bool logical_, bool wait_on_read_)
: imm(imm_), delay(delay_), events(event_), counters(counters_), wait_on_read(wait_on_read_),
logical(logical_), vmem_types(0)
{}
bool join(const wait_entry& other)
@@ -218,38 +188,24 @@ struct wait_entry {
return changed;
}
void remove_counter(counter_type counter)
void remove_alu_counter()
{
counters &= ~counter;
counters &= ~counter_alu;
delay = alu_delay_info();
events &= ~(event_valu | event_trans | event_salu);
}
if (counter == counter_lgkm) {
imm.lgkm = wait_imm::unset_counter;
events &= ~(event_smem | event_lds | event_gds | event_sendmsg);
}
if (counter == counter_vm) {
imm.vm = wait_imm::unset_counter;
events &= ~event_vmem;
vmem_types = 0;
}
if (counter == counter_exp) {
imm.exp = wait_imm::unset_counter;
events &= ~exp_events;
}
if (counter == counter_vs) {
imm.vs = wait_imm::unset_counter;
events &= ~event_vmem_store;
}
void remove_wait(wait_type type, uint32_t type_events)
{
counters &= ~(1 << type);
imm[type] = wait_imm::unset_counter;
events &= ~type_events | event_flat;
if (!(counters & counter_lgkm) && !(counters & counter_vm))
events &= ~event_flat;
events &= ~(type_events & event_flat);
if (counter == counter_alu) {
delay = alu_delay_info();
events &= ~(event_valu | event_trans | event_salu);
}
if (type == wait_type_vm)
vmem_types = 0;
}
UNUSED void print(FILE* output) const
@@ -270,14 +226,46 @@ struct wait_entry {
}
};
/* Constants for the wait-count pass that depend only on the gfx_level: the maximum value of
 * each wait counter, the events associated with each counter, the set of events flagged as
 * unordered, and a per-event lookup table of affected counters. */
struct target_info {
/* Maximum value for each wait_type's counter. */
uint16_t max_cnt[wait_type_num] = {};
/* For each wait_type, the mask of wait_event bits associated with that counter. */
uint32_t events[wait_type_num] = {};
/* Mask of wait_event bits flagged as unordered; assigned in the constructor. */
uint16_t unordered_events;
target_info(enum amd_gfx_level gfx_level)
{
max_cnt[wait_type_vm] = gfx_level >= GFX9 ? 62 : 14;
max_cnt[wait_type_exp] = 6;
max_cnt[wait_type_lgkm] = gfx_level >= GFX10 ? 62 : 14;
/* Below GFX10 the vs maximum is 0. */
max_cnt[wait_type_vs] = gfx_level >= GFX10 ? 62 : 0;
events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null |
event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
/* event_flat is listed under both lgkm and vm, so its table entry ends up with both flags. */
events[wait_type_lgkm] = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
events[wait_type_vm] = event_vmem | event_flat;
events[wait_type_vs] = event_vmem_store;
/* Invert events[] into the per-event table: for each event bit index j present in
 * events[i], set flag (1 << i) in counters[j]. */
for (unsigned i = 0; i < wait_type_num; i++) {
u_foreach_bit (j, events[i])
counters[j] |= (1 << i);
}
/* The ALU events are not part of any events[] mask; they are tagged with counter_alu
 * directly, indexed by the bit position of the event. */
counters[ffs(event_valu) - 1] |= counter_alu;
counters[ffs(event_trans) - 1] |= counter_alu;
counters[ffs(event_salu) - 1] |= counter_alu;
/* event_smem is always unordered; event_flat is added only below GFX10. */
unordered_events = event_smem | (gfx_level < GFX10 ? event_flat : 0);
}
/* Returns the counter flags recorded for `event`, looked up by the index of its lowest set
 * bit (any higher bits are ignored).
 * NOTE(review): ffs(0) is 0, so a zero `event` would read counters[-1]; this also assumes the
 * bit index is below num_events. Callers visible here pass a single wait_event - confirm. */
uint8_t get_counters_for_event(uint16_t event) const { return counters[ffs(event) - 1]; }
private:
/* Bitfields of counters affected by each event */
uint8_t counters[num_events] = {};
};
struct wait_ctx {
Program* program;
enum amd_gfx_level gfx_level;
uint16_t max_vm_cnt;
uint16_t max_exp_cnt;
uint16_t max_lgkm_cnt;
uint16_t max_vs_cnt;
uint16_t unordered_events = event_smem | event_flat;
const target_info* info;
bool vm_nonzero = false;
bool exp_nonzero = false;
@@ -293,12 +281,8 @@ struct wait_ctx {
std::map<PhysReg, wait_entry> gpr_map;
wait_ctx() {}
wait_ctx(Program* program_)
: program(program_), gfx_level(program_->gfx_level),
max_vm_cnt(program_->gfx_level >= GFX9 ? 62 : 14), max_exp_cnt(6),
max_lgkm_cnt(program_->gfx_level >= GFX10 ? 62 : 14),
max_vs_cnt(program_->gfx_level >= GFX10 ? 62 : 0),
unordered_events(event_smem | (program_->gfx_level < GFX10 ? event_flat : 0))
wait_ctx(Program* program_, const target_info* info_)
: program(program_), gfx_level(program_->gfx_level), info(info_)
{}
bool join(const wait_ctx* other, bool logical)
@@ -338,11 +322,6 @@ struct wait_ctx {
return changed;
}
/* Forwards to entry.remove_counter(counter). The `reg` argument is not used in the body. */
void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter)
{
entry.remove_counter(counter);
}
UNUSED void print(FILE* output) const
{
fprintf(output, "exp_nonzero: %u\n", exp_nonzero);
@@ -415,13 +394,16 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i
/* Vector Memory reads and writes return in the order they were issued */
uint8_t vmem_type = get_vmem_type(instr);
if (vmem_type && ((it->second.events & vm_events) == event_vmem) &&
it->second.vmem_types == vmem_type)
reg_imm.vm = wait_imm::unset_counter;
if (vmem_type) {
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event_vmem)) - 1);
if ((it->second.events & ctx.info->events[type]) == event_vmem &&
(type != wait_type_vm || it->second.vmem_types == vmem_type))
reg_imm[type] = wait_imm::unset_counter;
}
/* LDS reads and writes return in the order they were issued. same for GDS */
if (instr->isDS() &&
(it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
if (instr->isDS() && (it->second.events & ctx.info->events[wait_type_lgkm]) ==
(instr->ds().gds ? event_gds : event_lds))
reg_imm.lgkm = wait_imm::unset_counter;
wait.combine(reg_imm);
@@ -517,7 +499,7 @@ update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles)
wait_entry& entry = it->second;
if (clear) {
entry.remove_counter(counter_alu);
entry.remove_alu_counter();
} else {
entry.delay.valu_instrs += is_valu ? 1 : 0;
entry.delay.trans_instrs += is_trans ? 1 : 0;
@@ -527,7 +509,7 @@ update_alu(wait_ctx& ctx, bool is_valu, bool is_trans, bool clear, int cycles)
entry.delay.fixup();
if (it->second.delay.empty())
entry.remove_counter(counter_alu);
entry.remove_alu_counter();
}
if (!entry.counters)
@@ -619,19 +601,19 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
uint16_t& bar_ev = ctx.barrier_events[i];
if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
bar.exp = wait_imm::unset_counter;
bar_ev &= ~exp_events;
bar_ev &= ~ctx.info->events[wait_type_exp];
}
if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) {
bar.vm = wait_imm::unset_counter;
bar_ev &= ~(vm_events & ~event_flat);
bar_ev &= ~(ctx.info->events[wait_type_vm] & ~event_flat);
}
if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) {
bar.lgkm = wait_imm::unset_counter;
bar_ev &= ~(lgkm_events & ~event_flat);
bar_ev &= ~(ctx.info->events[wait_type_lgkm] & ~event_flat);
}
if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) {
bar.vs = wait_imm::unset_counter;
bar_ev &= ~vs_events;
bar_ev &= ~ctx.info->events[wait_type_vs];
}
if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter)
bar_ev &= ~event_flat;
@@ -646,20 +628,20 @@ kill(wait_imm& imm, alu_delay_info& delay, Instruction* instr, wait_ctx& ctx,
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
while (it != ctx.gpr_map.end()) {
if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
it->second.remove_wait(wait_type_exp, ctx.info->events[wait_type_exp]);
if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm);
it->second.remove_wait(wait_type_vm, ctx.info->events[wait_type_vm]);
if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
it->second.remove_wait(wait_type_lgkm, ctx.info->events[wait_type_lgkm]);
if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
it->second.remove_wait(wait_type_vs, ctx.info->events[wait_type_vs]);
if (delay.valu_instrs <= it->second.delay.valu_instrs)
it->second.delay.valu_instrs = alu_delay_info::valu_nop;
if (delay.trans_instrs <= it->second.delay.trans_instrs)
it->second.delay.trans_instrs = alu_delay_info::trans_nop;
it->second.delay.fixup();
if (it->second.delay.empty())
ctx.wait_and_remove_from_entry(it->first, it->second, counter_alu);
it->second.remove_alu_counter();
if (!it->second.counters)
it = ctx.gpr_map.erase(it);
else
@@ -698,15 +680,15 @@ update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_syn
bar.exp = 0;
if (counters & counter_vs)
bar.vs = 0;
} else if (!(bar_ev & ctx.unordered_events) && !(ctx.unordered_events & event)) {
if (counters & counter_lgkm && (bar_ev & lgkm_events) == event)
update_barrier_counter(&bar.lgkm, ctx.max_lgkm_cnt);
if (counters & counter_vm && (bar_ev & vm_events) == event)
update_barrier_counter(&bar.vm, ctx.max_vm_cnt);
if (counters & counter_exp && (bar_ev & exp_events) == event)
update_barrier_counter(&bar.exp, ctx.max_exp_cnt);
if (counters & counter_vs && (bar_ev & vs_events) == event)
update_barrier_counter(&bar.vs, ctx.max_vs_cnt);
} else if (!(bar_ev & ctx.info->unordered_events) && !(ctx.info->unordered_events & event)) {
if (counters & counter_lgkm && (bar_ev & ctx.info->events[wait_type_lgkm]) == event)
update_barrier_counter(&bar.lgkm, ctx.info->max_cnt[wait_type_lgkm]);
if (counters & counter_vm && (bar_ev & ctx.info->events[wait_type_vm]) == event)
update_barrier_counter(&bar.vm, ctx.info->max_cnt[wait_type_vm]);
if (counters & counter_exp && (bar_ev & ctx.info->events[wait_type_exp]) == event)
update_barrier_counter(&bar.exp, ctx.info->max_cnt[wait_type_exp]);
if (counters & counter_vs && (bar_ev & ctx.info->events[wait_type_vs]) == event)
update_barrier_counter(&bar.vs, ctx.info->max_cnt[wait_type_vs]);
}
}
}
@@ -714,7 +696,7 @@ update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_syn
void
update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
{
uint8_t counters = get_counters_for_event(event);
uint8_t counters = ctx.info->get_counters_for_event(event);
if (counters & counter_lgkm)
ctx.lgkm_nonzero = true;
@@ -727,7 +709,7 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
update_barrier_imm(ctx, counters, event, sync);
if (ctx.unordered_events & event)
if (ctx.info->unordered_events & event)
return;
if (ctx.pending_flat_lgkm)
@@ -738,22 +720,22 @@ update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_
for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
wait_entry& entry = e.second;
if (entry.events & ctx.unordered_events)
if (entry.events & ctx.info->unordered_events)
continue;
assert(entry.events);
if ((counters & counter_exp) && (entry.events & exp_events) == event &&
entry.imm.exp < ctx.max_exp_cnt)
if ((counters & counter_exp) && (entry.events & ctx.info->events[wait_type_exp]) == event &&
entry.imm.exp < ctx.info->max_cnt[wait_type_exp])
entry.imm.exp++;
if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event &&
entry.imm.lgkm < ctx.max_lgkm_cnt)
if ((counters & counter_lgkm) && (entry.events & ctx.info->events[wait_type_lgkm]) == event &&
entry.imm.lgkm < ctx.info->max_cnt[wait_type_lgkm])
entry.imm.lgkm++;
if ((counters & counter_vm) && (entry.events & vm_events) == event &&
entry.imm.vm < ctx.max_vm_cnt)
if ((counters & counter_vm) && (entry.events & ctx.info->events[wait_type_vm]) == event &&
entry.imm.vm < ctx.info->max_cnt[wait_type_vm])
entry.imm.vm++;
if ((counters & counter_vs) && (entry.events & vs_events) == event &&
entry.imm.vs < ctx.max_vs_cnt)
if ((counters & counter_vs) && (entry.events & ctx.info->events[wait_type_vs]) == event &&
entry.imm.vs < ctx.info->max_cnt[wait_type_vs])
entry.imm.vs++;
}
}
@@ -782,7 +764,7 @@ void
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
uint8_t vmem_types = 0, unsigned cycles = 0, bool force_linear = false)
{
uint16_t counters = get_counters_for_event(event);
uint16_t counters = ctx.info->get_counters_for_event(event);
wait_imm imm;
if (counters & counter_lgkm)
imm.lgkm = 0;
@@ -804,7 +786,8 @@ insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, boo
delay.salu_cycles = cycles;
}
wait_entry new_entry(event, imm, delay, !rc.is_linear() && !force_linear, wait_on_read);
wait_entry new_entry(event, imm, delay, counters, !rc.is_linear() && !force_linear,
wait_on_read);
new_entry.vmem_types |= vmem_types;
for (unsigned i = 0; i < rc.size(); i++) {
@@ -1126,16 +1109,18 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
void
insert_wait_states(Program* program)
{
target_info info(program->gfx_level);
/* per BB ctx */
std::vector<bool> done(program->blocks.size());
std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program));
std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program));
std::vector<wait_ctx> in_ctx(program->blocks.size(), wait_ctx(program, &info));
std::vector<wait_ctx> out_ctx(program->blocks.size(), wait_ctx(program, &info));
std::stack<unsigned, std::vector<unsigned>> loop_header_indices;
unsigned loop_progress = 0;
if (program->pending_lds_access) {
update_barrier_imm(in_ctx[0], get_counters_for_event(event_lds), event_lds,
update_barrier_imm(in_ctx[0], info.get_counters_for_event(event_lds), event_lds,
memory_sync_info(storage_shared));
}