aco/wave32: Use lane mask regclass for exec/vcc.

Currently, all uses of exec and vcc are hardcoded to use the s2 regclass.
This commit makes it possible to use s1 in wave32 mode and
s2 in wave64 mode.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
This commit is contained in:
Timur Kristóf
2019-11-27 11:04:47 +01:00
committed by Daniel Schürmann
parent b4efe179ed
commit e0bcefc3a0
12 changed files with 250 additions and 209 deletions

View File

@@ -302,14 +302,15 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
return;
if (ctx.info[idx].exec.back().second & mask_type_global) {
Temp exec_mask = ctx.info[idx].exec.back().first;
exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), exec_mask);
exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), exec_mask);
ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
return;
}
/* otherwise, the WQM mask should be one below the current mask */
ctx.info[idx].exec.pop_back();
assert(ctx.info[idx].exec.back().second & mask_type_wqm);
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first);
}
@@ -324,14 +325,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
!(ctx.info[idx].exec.back().second & mask_type_loop)) {
ctx.info[idx].exec.pop_back();
assert(ctx.info[idx].exec.back().second & mask_type_exact);
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first);
return;
}
/* otherwise, we create an exact mask and push to the stack */
Temp wqm = ctx.info[idx].exec.back().first;
Temp exact = bld.tmp(s2);
wqm = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
Temp exact = bld.tmp(bld.lm);
wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
ctx.info[idx].exec.back().first = wqm;
ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
@@ -359,7 +361,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
} else {
uint8_t mask = mask_type_global;
if (ctx.program->needs_wqm) {
exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), bld.exec(exec_mask));
exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
mask |= mask_type_wqm;
} else {
mask |= mask_type_exact;
@@ -383,7 +385,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
aco_ptr<Pseudo_instruction> phi;
for (int i = 0; i < info.num_exec_masks - 1; i++) {
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
phi->definitions[0] = bld.def(s2);
phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
}
@@ -393,7 +395,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
if (info.has_divergent_break) {
/* this phi might be trivial but ensures a parallelcopy on the loop header */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
phi->definitions[0] = bld.def(s2);
phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
}
@@ -401,9 +403,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
/* create ssa name for loop active mask */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
if (info.has_divergent_continue)
phi->definitions[0] = bld.def(s2);
phi->definitions[0] = bld.def(bld.lm);
else
phi->definitions[0] = bld.def(s2, exec);
phi->definitions[0] = bld.def(bld.lm, exec);
phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
Temp loop_active = bld.insert(std::move(phi));
@@ -423,7 +425,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
i++;
}
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first), mask_type);
}
@@ -480,7 +483,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
} else {
/* create phi for loop footer */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
phi->definitions[0] = bld.def(s2);
phi->definitions[0] = bld.def(bld.lm);
for (unsigned i = 0; i < phi->operands.size(); i++)
phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first);
ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
@@ -510,7 +513,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
transition_to_Exact(ctx, bld, idx);
}
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
ctx.info[idx].exec.back().first);
ctx.loop.pop_back();
@@ -536,7 +540,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
continue;
}
Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(s2, exec) : bld.def(s2),
Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm),
ctx.info[preds[0]].exec[i].first,
ctx.info[preds[1]].exec[i].first);
uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
@@ -578,7 +582,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
if (block->kind & block_kind_merge) {
Temp restore = ctx.info[idx].exec.back().first;
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), restore);
assert(restore.size() == bld.lm.size());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore);
}
return i;
@@ -589,7 +594,7 @@ void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instructi
Operand offset = instr->operands[1];
if (need_check) {
/* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
Temp nonempty = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), cur_exec, Operand(0u));
Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u));
if (offset.isLiteral())
offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset);
@@ -665,7 +670,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
assert(num);
Operand cond = instr->operands[0];
for (int i = num - 1; i >= 0; i--) {
Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == num - 1) {
andn2->operands[0].setFixed(exec);
@@ -689,8 +694,9 @@ void process_instructions(exec_ctx& ctx, Block* block,
if (instr->opcode == aco_opcode::p_is_helper || instr->opcode == aco_opcode::p_load_helper) {
Definition dst = instr->definitions[0];
assert(dst.size() == bld.lm.size());
if (state == Exact) {
instr.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b64, Format::SOP1, 1, 1));
instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
instr->operands[0] = Operand(0u);
instr->definitions[0] = dst;
} else {
@@ -710,7 +716,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
assert(instr->opcode == aco_opcode::p_is_helper || exact_mask.second & mask_type_initial);
assert(exact_mask.second & mask_type_exact);
instr.reset(create_instruction<SOP2_instruction>(aco_opcode::s_andn2_b64, Format::SOP2, 2, 2));
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
instr->operands[1] = Operand(exact_mask.first);
instr->definitions[0] = dst;
@@ -726,8 +732,8 @@ void process_instructions(exec_ctx& ctx, Block* block,
if (instr->operands.empty()) {
/* transition to exact and set exec to zero */
Temp old_exec = ctx.info[block->index].exec.back().first;
Temp new_exec = bld.tmp(s2);
cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
Temp new_exec = bld.tmp(bld.lm);
cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
if (ctx.info[block->index].exec.back().second & mask_type_exact) {
ctx.info[block->index].exec.back().first = new_exec;
@@ -746,7 +752,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
num += ctx.info[block->index].exec.size() - 1;
for (int i = num - 1; i >= 0; i--) {
if (ctx.info[block->index].exec[i].second & mask_type_exact) {
Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == num - 1) {
andn2->operands[0].setFixed(exec);
@@ -878,13 +884,13 @@ void add_branch_code(exec_ctx& ctx, Block* block)
}
Temp old_exec = ctx.info[idx].exec.back().first;
Temp new_exec = bld.tmp(s2);
Temp cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
Temp new_exec = bld.tmp(bld.lm);
Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
ctx.info[idx].exec.back().first = new_exec;
for (int i = num - 1; i >= 0; i--) {
Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == (int)ctx.info[idx].exec.size() - 1)
andn2->definitions[0].setFixed(exec);
@@ -912,7 +918,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
Temp cond = Temp();
for (int exec_idx = ctx.info[idx].exec.size() - 1; exec_idx >= 0; exec_idx--) {
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) {
cond = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u));
cond = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u));
break;
}
}
@@ -957,8 +963,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
Temp current_exec = ctx.info[idx].exec.back().first;
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
Temp then_mask = bld.tmp(s2);
Temp old_exec = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
Temp then_mask = bld.tmp(bld.lm);
Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
bld.exec(Definition(then_mask)), cond, bld.exec(current_exec));
ctx.info[idx].exec.back().first = old_exec;
@@ -978,7 +984,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
uint8_t mask_type = ctx.info[idx].exec.back().second;
ctx.info[idx].exec.pop_back();
Temp orig_exec = ctx.info[idx].exec.back().first;
Temp else_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2, exec),
Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec),
bld.def(s1, scc), orig_exec, bld.exec(then_mask));
/* add next current exec to the stack */
@@ -998,7 +1004,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
cond = bld.tmp(s1);
Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
exec_mask, current_exec);
ctx.info[idx].exec[exec_idx].first = exec_mask;
if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
@@ -1010,7 +1016,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
Block& succ = ctx.program->blocks[succ_idx];
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
@@ -1028,7 +1034,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
break;
cond = bld.tmp(s1);
Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
exec_mask, bld.exec(current_exec));
ctx.info[idx].exec[exec_idx].first = exec_mask;
}
@@ -1039,7 +1045,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
Block& succ = ctx.program->blocks[succ_idx];
if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);