aco: Introduce new VMEM load/store helpers.

These are going to be used for loading and storing inputs
and outputs in various stages, such as GS, TCS and TES.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3964>
This commit is contained in:
Timur Kristóf
2020-02-28 15:48:01 +01:00
committed by Marge Bot
parent 4fc1da208e
commit 19d5dc9cee
@@ -2893,6 +2893,175 @@ unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
return align;
}
/* Emits a p_create_vector instruction that combines cnt temporaries into one vector.
 *
 * ctx:       instruction selection context; the instruction is inserted into ctx->block.
 * arr:       the elements to combine. Every element must have the same size as arr[0].
 *            arr[0] is read to size the destination when dst is not supplied.
 * cnt:       number of elements taken from arr; must not exceed NIR_MAX_VEC_COMPONENTS.
 * reg_type:  register type of the destination; only used when dst is not supplied.
 * split_cnt: when non-zero, the result is handed to emit_split_vector with this count.
 *            When zero, the elements of arr are recorded in ctx->allocated_vec instead.
 * dst:       optional destination. A temporary of size cnt * arr[0].size() is created
 *            when dst has no id.
 *
 * Returns the destination temporary.
 */
Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned split_cnt = 0u, Temp dst = Temp())
{
   Builder bld(ctx->program, ctx->block);

   if (!dst.id())
      dst = bld.tmp(RegClass(reg_type, cnt * arr[0].size()));

   std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;

   /* allocated_vec is written at every index below cnt, so a larger cnt would
    * write past the end of the fixed-size array.
    */
   assert(cnt <= allocated_vec.size());

   aco_ptr<Pseudo_instruction> instr {create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
   instr->definitions[0] = Definition(dst);

   for (unsigned i = 0; i < cnt; ++i) {
      assert(arr[i].size() == arr[0].size());
      allocated_vec[i] = arr[i];
      instr->operands[i] = Operand(arr[i]);
   }

   bld.insert(std::move(instr));

   if (split_cnt)
      emit_split_vector(ctx, dst, split_cnt);
   else
      ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */

   return dst;
}
/* Splits a constant offset into a part below 4096 and a multiple of 4096, and
 * moves the multiple of 4096 into voffset.
 *
 * bld:          builder used to emit the instruction that produces the new voffset.
 * voffset:      in/out. When it has no id, it becomes a new v1 holding the excess.
 *               An s1 voffset is replaced by the result of an s_add_u32, and a v1
 *               voffset by the result of a 32-bit vector add. Any other register
 *               class is not supported. Left untouched when const_offset < 4096.
 * const_offset: the full constant offset.
 *
 * Returns the remaining constant offset, which is always below 4096.
 *
 * NOTE(review): 4096 is presumably the exclusive upper bound of the immediate
 * offset field of the VMEM instructions this feeds - confirm against the ISA docs.
 */
inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset)
{
   constexpr unsigned imm_limit = 4096u;

   /* Nothing has to be moved when the constant is already small enough. */
   if (const_offset < imm_limit)
      return const_offset;

   const unsigned excess = const_offset / imm_limit * imm_limit;
   const unsigned remainder = const_offset % imm_limit;

   if (!voffset.id()) {
      /* No variable offset yet: the excess becomes the variable offset. */
      voffset = bld.copy(bld.def(v1), Operand(excess));
   } else if (unlikely(voffset.regClass() == s1)) {
      voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess), Operand(voffset));
   } else if (likely(voffset.regClass() == v1)) {
      voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess));
   } else {
      unreachable("Unsupported register class of voffset");
   }

   return remainder;
}
/* Emits exactly one MUBUF buffer store of vdata.
 *
 * ctx:           instruction selection context; the store is emitted into ctx->block.
 * descriptor:    buffer descriptor operand.
 * voffset:       optional variable offset; it is moved to a VGPR via as_vgpr when present.
 *                "offen" is enabled only when a variable offset ends up being used.
 * soffset:       optional scalar offset; the constant 0 is used when it has no id.
 * vdata:         the data to store. Its size must be 1 to 4, and must not be 3 on GFX6.
 * const_offset:  constant offset. Any part of it at or above 4096 is folded into voffset
 *                by resolve_excess_vmem_const_offset before the store is emitted.
 * allow_reorder: written to the can_reorder field of the emitted instruction.
 * slc:           forwarded as the slc flag of the store.
 *
 * The glc flag is always passed as true and dlc as false.
 */
void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
                             unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
{
   assert(vdata.id());
   assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
   assert(vdata.size() >= 1 && vdata.size() <= 4);

   Builder bld(ctx->program, ctx->block);

   /* The opcode is chosen by stepping (size - 1) entries past buffer_store_dword;
    * this relies on the store opcodes for sizes 1..4 being consecutive in aco_opcode.
    */
   const unsigned first_store_op = static_cast<unsigned>(aco_opcode::buffer_store_dword);
   const aco_opcode store_op = static_cast<aco_opcode>(first_store_op + vdata.size() - 1);

   /* May replace voffset, so the operands are only built afterwards. */
   const unsigned imm_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);

   const Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
   const Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);

   const bool offen = !voffset_op.isUndefined();
   const bool idxen = false;
   const bool addr64 = false;
   const bool disable_wqm = false;
   const bool glc = true;
   const bool dlc = false;

   Builder::Result store = bld.mubuf(store_op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), imm_offset,
                                     offen, idxen, addr64, disable_wqm, glc, dlc, slc);

   static_cast<MUBUF_instruction *>(store.instr)->can_reorder = allow_reorder;
}
/* Stores the components of src selected by write_mask using MUBUF buffer stores.
 *
 * ctx:               instruction selection context.
 * src:               the vector holding the data to store.
 * descriptor:        buffer descriptor, forwarded to every store.
 * voffset, soffset:  optional variable and scalar offsets, forwarded to every store.
 * base_const_offset: constant offset added to the offset of every store.
 * elem_size_bytes:   size of one component, 4 or 8. 8-byte components are handled as
 *                    pairs of 4-byte components by widening write_mask with widen_mask.
 * write_mask:        non-zero mask of the components to store.
 * allow_combining:   when true, up to 4 consecutive dwords go into a single store;
 *                    when false, every dword gets its own store.
 * reorder:           forwarded as allow_reorder to emit_single_mubuf_store.
 * slc:               forwarded to emit_single_mubuf_store.
 */
void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
                      unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
                      bool allow_combining = true, bool reorder = true, bool slc = false)
{
   assert(elem_size_bytes == 4 || elem_size_bytes == 8);
   assert(write_mask);

   if (elem_size_bytes == 8) {
      /* From here on everything is expressed in dwords. */
      elem_size_bytes = 4;
      write_mask = widen_mask(write_mask, 2);
   }

   while (write_mask) {
      /* Take the next run of consecutive set bits out of the mask. */
      int start = 0;
      int count = 0;
      u_bit_scan_consecutive_range(&write_mask, &start, &count);
      assert(count > 0);
      assert(start >= 0);

      /* Split the run into stores of at most 4 dwords each. */
      while (count > 0) {
         unsigned sub_count = allow_combining ? MIN2(count, 4) : 1;
         unsigned const_offset = (unsigned) start * elem_size_bytes + base_const_offset;

         /* GFX6 doesn't have buffer_store_dwordx3, so make sure not to emit that here either. */
         if (unlikely(ctx->program->chip_class == GFX6 && sub_count == 3))
            sub_count = 2;

         Temp elem = extract_subvector(ctx, src, start, sub_count, RegType::vgpr);
         emit_single_mubuf_store(ctx, descriptor, voffset, soffset, elem, const_offset, reorder, slc);

         count -= sub_count;
         start += sub_count;
      }

      assert(count == 0);
   }
}
/* Emits exactly one MUBUF buffer load into a new VGPR temporary.
 *
 * ctx:           instruction selection context; the load is emitted into ctx->block.
 * descriptor:    buffer descriptor operand.
 * voffset:       optional variable offset; it is moved to a VGPR via as_vgpr when present.
 *                "offen" is enabled only when a variable offset ends up being used.
 * soffset:       optional scalar offset; the constant 0 is used when it has no id.
 * const_offset:  constant offset. Any part of it at or above 4096 is folded into voffset
 *                by resolve_excess_vmem_const_offset before the load is emitted.
 * size_dwords:   size of the load, 1 to 4; 3 is not allowed on GFX6.
 * allow_reorder: written to the can_reorder field of the emitted instruction.
 *
 * The glc flag is always passed as true, slc as false, and dlc is enabled on GFX10
 * and newer.
 *
 * Returns the temporary that receives the loaded data.
 */
Temp emit_single_mubuf_load(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset,
                            unsigned const_offset, unsigned size_dwords, bool allow_reorder = true)
{
   assert(size_dwords != 3 || ctx->program->chip_class != GFX6);
   assert(size_dwords >= 1 && size_dwords <= 4);

   Builder bld(ctx->program, ctx->block);
   Temp result = bld.tmp(RegClass(RegType::vgpr, size_dwords));

   /* The opcode is chosen by stepping (size_dwords - 1) entries past buffer_load_dword;
    * this relies on the load opcodes for sizes 1..4 being consecutive in aco_opcode.
    */
   const unsigned first_load_op = static_cast<unsigned>(aco_opcode::buffer_load_dword);
   const aco_opcode load_op = static_cast<aco_opcode>(first_load_op + size_dwords - 1);

   /* May replace voffset, so the operands are only built afterwards. */
   const unsigned imm_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);

   const Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
   const Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);

   const bool offen = !voffset_op.isUndefined();
   const bool idxen = false;
   const bool addr64 = false;
   const bool disable_wqm = false;
   const bool glc = true;
   const bool dlc = ctx->program->chip_class >= GFX10;
   const bool slc = false;

   Builder::Result load = bld.mubuf(load_op, Definition(result), Operand(descriptor), voffset_op, soffset_op, imm_offset,
                                    offen, idxen, addr64, disable_wqm, glc, dlc, slc);

   static_cast<MUBUF_instruction *>(load.instr)->can_reorder = allow_reorder;

   return result;
}
/* Loads num_components components into dst using one or more MUBUF buffer loads.
 *
 * ctx:               instruction selection context.
 * dst:               destination; its size must equal num_components * elem_size_bytes / 4.
 * descriptor:        buffer descriptor, forwarded to every load.
 * voffset, soffset:  optional variable and scalar offsets, forwarded to every load.
 * base_const_offset: constant offset added to the offset of every load.
 * elem_size_bytes:   size of one component, 4 or 8. 8-byte components are loaded as
 *                    two 4-byte components each.
 * num_components:    number of components of elem_size_bytes each.
 * stride:            distance in bytes between consecutive loads when not combining.
 *                    Exactly one of "stride is non-zero" and allow_combining must hold;
 *                    with a zero stride the (4-byte) element size is used.
 * allow_combining:   when true, the components are fetched with loads of 4, 3 or 2
 *                    dwords if the dword count is divisible by that size (3 is never
 *                    used on GFX6); otherwise one dword is loaded at a time.
 * allow_reorder:     forwarded to emit_single_mubuf_load.
 *
 * The loaded pieces are combined into dst with create_vec_from_array, which is asked
 * to split dst into the original num_components components.
 */
void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
                     unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
                     unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
{
   assert(elem_size_bytes == 4 || elem_size_bytes == 8);
   assert((num_components * elem_size_bytes / 4) == dst.size());
   assert(!!stride != allow_combining);

   /* Remember the component count before it is converted to a dword count. */
   const unsigned split_cnt = num_components;

   if (elem_size_bytes == 8) {
      elem_size_bytes = 4;
      num_components *= 2;
   }

   if (!stride)
      stride = elem_size_bytes;

   unsigned load_size = 1;
   if (allow_combining) {
      if ((num_components % 4) == 0)
         load_size = 4;
      else if ((num_components % 3) == 0 && ctx->program->chip_class != GFX6)
         load_size = 3;
      else if ((num_components % 2) == 0)
         load_size = 2;
   }

   const unsigned num_loads = num_components / load_size;
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;

   /* elems has a fixed capacity, while num_loads can be as large as twice the
    * component count (8-byte components loaded one dword at a time).
    */
   assert(num_loads <= elems.size());

   for (unsigned i = 0; i < num_loads; ++i) {
      unsigned const_offset = i * stride * load_size + base_const_offset;
      elems[i] = emit_single_mubuf_load(ctx, descriptor, voffset, soffset, const_offset, load_size, allow_reorder);
   }

   create_vec_from_array(ctx, elems.data(), num_loads, RegType::vgpr, split_cnt, dst);
}
void visit_store_vsgs_output(isel_context *ctx, nir_intrinsic_instr *instr)
{
unsigned write_mask = nir_intrinsic_write_mask(instr);