aco/wave32: Use lane mask regclass for exec/vcc.
Currently all usages of exec and vcc are hardcoded to use s2 regclass. This commit makes it possible to use s1 in wave32 mode and s2 in wave64 mode. Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Commit metadata: commit e0bcefc3a0 (parent b4efe179ed), committed by Daniel Schürmann.
@@ -302,14 +302,15 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
|
||||
return;
|
||||
if (ctx.info[idx].exec.back().second & mask_type_global) {
|
||||
Temp exec_mask = ctx.info[idx].exec.back().first;
|
||||
exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), exec_mask);
|
||||
exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), exec_mask);
|
||||
ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
|
||||
return;
|
||||
}
|
||||
/* otherwise, the WQM mask should be one below the current mask */
|
||||
ctx.info[idx].exec.pop_back();
|
||||
assert(ctx.info[idx].exec.back().second & mask_type_wqm);
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
|
||||
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
|
||||
ctx.info[idx].exec.back().first);
|
||||
}
|
||||
|
||||
@@ -324,14 +325,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
|
||||
!(ctx.info[idx].exec.back().second & mask_type_loop)) {
|
||||
ctx.info[idx].exec.pop_back();
|
||||
assert(ctx.info[idx].exec.back().second & mask_type_exact);
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
|
||||
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
|
||||
ctx.info[idx].exec.back().first);
|
||||
return;
|
||||
}
|
||||
/* otherwise, we create an exact mask and push to the stack */
|
||||
Temp wqm = ctx.info[idx].exec.back().first;
|
||||
Temp exact = bld.tmp(s2);
|
||||
wqm = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
|
||||
Temp exact = bld.tmp(bld.lm);
|
||||
wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
|
||||
bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
|
||||
ctx.info[idx].exec.back().first = wqm;
|
||||
ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
|
||||
@@ -359,7 +361,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
||||
} else {
|
||||
uint8_t mask = mask_type_global;
|
||||
if (ctx.program->needs_wqm) {
|
||||
exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), bld.exec(exec_mask));
|
||||
exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
|
||||
mask |= mask_type_wqm;
|
||||
} else {
|
||||
mask |= mask_type_exact;
|
||||
@@ -383,7 +385,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
||||
aco_ptr<Pseudo_instruction> phi;
|
||||
for (int i = 0; i < info.num_exec_masks - 1; i++) {
|
||||
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
|
||||
phi->definitions[0] = bld.def(s2);
|
||||
phi->definitions[0] = bld.def(bld.lm);
|
||||
phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
|
||||
ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
|
||||
}
|
||||
@@ -393,7 +395,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
||||
if (info.has_divergent_break) {
|
||||
/* this phi might be trivial but ensures a parallelcopy on the loop header */
|
||||
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
||||
phi->definitions[0] = bld.def(s2);
|
||||
phi->definitions[0] = bld.def(bld.lm);
|
||||
phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
|
||||
ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
|
||||
}
|
||||
@@ -401,9 +403,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
||||
/* create ssa name for loop active mask */
|
||||
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
||||
if (info.has_divergent_continue)
|
||||
phi->definitions[0] = bld.def(s2);
|
||||
phi->definitions[0] = bld.def(bld.lm);
|
||||
else
|
||||
phi->definitions[0] = bld.def(s2, exec);
|
||||
phi->definitions[0] = bld.def(bld.lm, exec);
|
||||
phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
|
||||
Temp loop_active = bld.insert(std::move(phi));
|
||||
|
||||
@@ -423,7 +425,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
||||
i++;
|
||||
}
|
||||
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
|
||||
ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
|
||||
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
||||
ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
|
||||
ctx.info[idx].exec.back().first), mask_type);
|
||||
}
|
||||
|
||||
@@ -480,7 +483,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
||||
} else {
|
||||
/* create phi for loop footer */
|
||||
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
||||
phi->definitions[0] = bld.def(s2);
|
||||
phi->definitions[0] = bld.def(bld.lm);
|
||||
for (unsigned i = 0; i < phi->operands.size(); i++)
|
||||
phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first);
|
||||
ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
|
||||
@@ -510,7 +513,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
||||
transition_to_Exact(ctx, bld, idx);
|
||||
}
|
||||
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
|
||||
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
|
||||
ctx.info[idx].exec.back().first);
|
||||
|
||||
ctx.loop.pop_back();
|
||||
@@ -536,7 +540,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
||||
continue;
|
||||
}
|
||||
|
||||
Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(s2, exec) : bld.def(s2),
|
||||
Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm),
|
||||
ctx.info[preds[0]].exec[i].first,
|
||||
ctx.info[preds[1]].exec[i].first);
|
||||
uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
|
||||
@@ -578,7 +582,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
||||
|
||||
if (block->kind & block_kind_merge) {
|
||||
Temp restore = ctx.info[idx].exec.back().first;
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), restore);
|
||||
assert(restore.size() == bld.lm.size());
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore);
|
||||
}
|
||||
|
||||
return i;
|
||||
@@ -589,7 +594,7 @@ void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instructi
|
||||
Operand offset = instr->operands[1];
|
||||
if (need_check) {
|
||||
/* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
|
||||
Temp nonempty = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), cur_exec, Operand(0u));
|
||||
Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u));
|
||||
|
||||
if (offset.isLiteral())
|
||||
offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset);
|
||||
@@ -665,7 +670,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
||||
assert(num);
|
||||
Operand cond = instr->operands[0];
|
||||
for (int i = num - 1; i >= 0; i--) {
|
||||
Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
|
||||
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
||||
ctx.info[block->index].exec[i].first, cond);
|
||||
if (i == num - 1) {
|
||||
andn2->operands[0].setFixed(exec);
|
||||
@@ -689,8 +694,9 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
||||
|
||||
if (instr->opcode == aco_opcode::p_is_helper || instr->opcode == aco_opcode::p_load_helper) {
|
||||
Definition dst = instr->definitions[0];
|
||||
assert(dst.size() == bld.lm.size());
|
||||
if (state == Exact) {
|
||||
instr.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b64, Format::SOP1, 1, 1));
|
||||
instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
|
||||
instr->operands[0] = Operand(0u);
|
||||
instr->definitions[0] = dst;
|
||||
} else {
|
||||
@@ -710,7 +716,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
||||
assert(instr->opcode == aco_opcode::p_is_helper || exact_mask.second & mask_type_initial);
|
||||
assert(exact_mask.second & mask_type_exact);
|
||||
|
||||
instr.reset(create_instruction<SOP2_instruction>(aco_opcode::s_andn2_b64, Format::SOP2, 2, 2));
|
||||
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
|
||||
instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
|
||||
instr->operands[1] = Operand(exact_mask.first);
|
||||
instr->definitions[0] = dst;
|
||||
@@ -726,8 +732,8 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
||||
if (instr->operands.empty()) {
|
||||
/* transition to exact and set exec to zero */
|
||||
Temp old_exec = ctx.info[block->index].exec.back().first;
|
||||
Temp new_exec = bld.tmp(s2);
|
||||
cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
|
||||
Temp new_exec = bld.tmp(bld.lm);
|
||||
cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
|
||||
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
|
||||
if (ctx.info[block->index].exec.back().second & mask_type_exact) {
|
||||
ctx.info[block->index].exec.back().first = new_exec;
|
||||
@@ -746,7 +752,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
||||
num += ctx.info[block->index].exec.size() - 1;
|
||||
for (int i = num - 1; i >= 0; i--) {
|
||||
if (ctx.info[block->index].exec[i].second & mask_type_exact) {
|
||||
Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
|
||||
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
||||
ctx.info[block->index].exec[i].first, cond);
|
||||
if (i == num - 1) {
|
||||
andn2->operands[0].setFixed(exec);
|
||||
@@ -878,13 +884,13 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
||||
}
|
||||
|
||||
Temp old_exec = ctx.info[idx].exec.back().first;
|
||||
Temp new_exec = bld.tmp(s2);
|
||||
Temp cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
|
||||
Temp new_exec = bld.tmp(bld.lm);
|
||||
Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
|
||||
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
|
||||
ctx.info[idx].exec.back().first = new_exec;
|
||||
|
||||
for (int i = num - 1; i >= 0; i--) {
|
||||
Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
|
||||
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
||||
ctx.info[block->index].exec[i].first, cond);
|
||||
if (i == (int)ctx.info[idx].exec.size() - 1)
|
||||
andn2->definitions[0].setFixed(exec);
|
||||
@@ -912,7 +918,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
||||
Temp cond = Temp();
|
||||
for (int exec_idx = ctx.info[idx].exec.size() - 1; exec_idx >= 0; exec_idx--) {
|
||||
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) {
|
||||
cond = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u));
|
||||
cond = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u));
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -957,8 +963,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
||||
Temp current_exec = ctx.info[idx].exec.back().first;
|
||||
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
|
||||
|
||||
Temp then_mask = bld.tmp(s2);
|
||||
Temp old_exec = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
|
||||
Temp then_mask = bld.tmp(bld.lm);
|
||||
Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
|
||||
bld.exec(Definition(then_mask)), cond, bld.exec(current_exec));
|
||||
|
||||
ctx.info[idx].exec.back().first = old_exec;
|
||||
@@ -978,7 +984,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
||||
uint8_t mask_type = ctx.info[idx].exec.back().second;
|
||||
ctx.info[idx].exec.pop_back();
|
||||
Temp orig_exec = ctx.info[idx].exec.back().first;
|
||||
Temp else_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2, exec),
|
||||
Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec),
|
||||
bld.def(s1, scc), orig_exec, bld.exec(then_mask));
|
||||
|
||||
/* add next current exec to the stack */
|
||||
@@ -998,7 +1004,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
||||
for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
|
||||
cond = bld.tmp(s1);
|
||||
Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
|
||||
exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
|
||||
exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
|
||||
exec_mask, current_exec);
|
||||
ctx.info[idx].exec[exec_idx].first = exec_mask;
|
||||
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
|
||||
@@ -1010,7 +1016,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
||||
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
|
||||
Block& succ = ctx.program->blocks[succ_idx];
|
||||
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
|
||||
ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
|
||||
ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
|
||||
}
|
||||
|
||||
bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
|
||||
@@ -1028,7 +1034,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
||||
break;
|
||||
cond = bld.tmp(s1);
|
||||
Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
|
||||
exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
|
||||
exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
|
||||
exec_mask, bld.exec(current_exec));
|
||||
ctx.info[idx].exec[exec_idx].first = exec_mask;
|
||||
}
|
||||
@@ -1039,7 +1045,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
||||
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
|
||||
Block& succ = ctx.program->blocks[succ_idx];
|
||||
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
|
||||
ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
|
||||
ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
|
||||
}
|
||||
|
||||
bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
|
||||
|
||||
[Web-UI footer from the scraped commit page: "Reference in New Issue" / "Block a user" — not part of the commit content.]