ir3: Initial support for private memory
Add information that the driver will need to set up registers, and implement support for load_scratch/store_scratch using private memory. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7386>
This commit is contained in:
@@ -729,6 +729,11 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr,
|
||||
src2 = (instr->regs_count >= 3) ? instr->regs[2] : NULL;
|
||||
}
|
||||
|
||||
if ((instr->opc == OPC_STP || instr->opc == OPC_LDP) &&
|
||||
src2->iim_val * type_size(instr->cat6.type) > 32) {
|
||||
info->multi_dword_ldp_stp = true;
|
||||
}
|
||||
|
||||
/* TODO we need a more comprehensive list about which instructions
|
||||
* can be encoded which way. Or possibly use IR3_INSTR_0 flag to
|
||||
* indicate to use the src_off encoding even if offset is zero
|
||||
@@ -938,6 +943,7 @@ void * ir3_assemble(struct ir3_shader_variant *v)
|
||||
info->max_reg = -1;
|
||||
info->max_half_reg = -1;
|
||||
info->max_const = -1;
|
||||
info->multi_dword_ldp_stp = false;
|
||||
|
||||
uint32_t instr_count = 0;
|
||||
foreach_block (block, &shader->block_list) {
|
||||
@@ -1464,6 +1470,12 @@ ir3_valid_flags(struct ir3_instruction *instr, unsigned n,
|
||||
if ((instr->opc == OPC_STL) && (n != 2))
|
||||
return false;
|
||||
|
||||
if ((instr->opc == OPC_LDP) && (n == 0))
|
||||
return false;
|
||||
|
||||
if ((instr->opc == OPC_STP) && (n != 2))
|
||||
return false;
|
||||
|
||||
if (instr->opc == OPC_STLW && n == 0)
|
||||
return false;
|
||||
|
||||
|
||||
@@ -64,6 +64,7 @@ struct ir3_info {
|
||||
int8_t max_reg; /* highest GPR # used by shader */
|
||||
int8_t max_half_reg;
|
||||
int16_t max_const;
|
||||
bool multi_dword_ldp_stp;
|
||||
|
||||
/* number of sync bits: */
|
||||
uint16_t ss, sy;
|
||||
@@ -400,6 +401,8 @@ struct ir3_instruction {
|
||||
IR3_BARRIER_BUFFER_W = 1 << 6,
|
||||
IR3_BARRIER_ARRAY_R = 1 << 7,
|
||||
IR3_BARRIER_ARRAY_W = 1 << 8,
|
||||
IR3_BARRIER_PRIVATE_R = 1 << 9,
|
||||
IR3_BARRIER_PRIVATE_W = 1 << 10,
|
||||
} barrier_class, barrier_conflict;
|
||||
|
||||
/* Entry in ir3_block's instruction list: */
|
||||
@@ -1692,9 +1695,11 @@ INSTR2(LDLV)
|
||||
INSTR3(LDG)
|
||||
INSTR3(LDL)
|
||||
INSTR3(LDLW)
|
||||
INSTR3(LDP)
|
||||
INSTR3(STG)
|
||||
INSTR3(STL)
|
||||
INSTR3(STLW)
|
||||
INSTR3(STP)
|
||||
INSTR1(RESINFO)
|
||||
INSTR1(RESFMT)
|
||||
INSTR2(ATOMIC_ADD)
|
||||
|
||||
@@ -1052,6 +1052,57 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
return atomic;
|
||||
}
|
||||
|
||||
/*
 * Emit the ir3 instruction for a NIR load_scratch intrinsic.
 *
 * src[] = { offset }.
 *
 * ctx:  compile context; the new instruction is created in ctx->block.
 * intr: the NIR intrinsic being lowered.
 * dst:  output array; receives one entry per loaded component, filled in by
 *       ir3_split_dest().
 *
 * The load is emitted as a single LDP instruction.
 */
|
||||
static void
|
||||
emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
|
||||
struct ir3_instruction **dst)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *ldp, *offset;
|
||||
|
||||
|
||||
/* Only the first component of the offset source is used. */
offset = ir3_get_src(ctx, &intr->src[0])[0];
|
||||
|
||||
/* Sources, in order: the offset, an immediate holding the component count,
 * and an immediate zero.
 * NOTE(review): the meaning the encoder gives to the two immediates is not
 * visible in this function - confirm against emit_cat6.
 */
ldp = ir3_LDP(b, offset, 0,
|
||||
create_immed(b, intr->num_components), 0,
|
||||
create_immed(b, 0), 0);
|
||||
|
||||
/* Access type follows the NIR destination. */
ldp->cat6.type = utype_dst(intr->dest);
|
||||
/* Destination write mask sized by the component count (assuming MASK(n)
 * yields the low n bits - TODO confirm).
 */
ldp->regs[0]->wrmask = MASK(intr->num_components);
|
||||
|
||||
/* Classified as a private-memory read, conflicting with private-memory
 * writes.
 */
ldp->barrier_class = IR3_BARRIER_PRIVATE_R;
|
||||
ldp->barrier_conflict = IR3_BARRIER_PRIVATE_W;
|
||||
|
||||
/* Break the vector result into per-component entries of dst. */
ir3_split_dest(b, dst, ldp, 0, intr->num_components);
|
||||
}
|
||||
|
||||
/*
 * Emit the ir3 instruction for a NIR store_scratch intrinsic.
 *
 * src[] = { value, offset }. const_index[] = { write_mask }
 *
 * ctx:  compile context; the new instruction is created in ctx->block.
 * intr: the NIR intrinsic being lowered.
 *
 * The store is emitted as a single STP instruction.  The write mask must
 * cover exactly the first intr->num_components components (asserted below).
 */
|
||||
static void
|
||||
emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
{
|
||||
struct ir3_block *b = ctx->block;
|
||||
struct ir3_instruction *stp, *offset;
|
||||
struct ir3_instruction * const *value;
|
||||
unsigned wrmask, ncomp;
|
||||
|
||||
|
||||
value = ir3_get_src(ctx, &intr->src[0]);
|
||||
/* Only the first component of the offset source is used. */
offset = ir3_get_src(ctx, &intr->src[1])[0];
|
||||
|
||||
wrmask = nir_intrinsic_write_mask(intr);
|
||||
/* ffs() gives the 1-based index of the lowest set bit, so applied to
 * ~wrmask this is the number of contiguous set bits at the bottom of
 * wrmask.
 */
ncomp = ffs(~wrmask) - 1;
|
||||
|
||||
/* Partial or non-contiguous write masks are not handled here.
 * NOTE(review): presumably they are split up by an earlier pass before
 * reaching this point - confirm.
 */
assert(wrmask == BITFIELD_MASK(intr->num_components));
|
||||
|
||||
/* Sources, in order: the offset, the value components gathered into one
 * collect, and an immediate holding the component count.
 */
stp = ir3_STP(b, offset, 0,
|
||||
ir3_create_collect(ctx, value, ncomp), 0,
|
||||
create_immed(b, ncomp), 0);
|
||||
stp->cat6.dst_offset = 0;
|
||||
/* Access type follows the stored value's NIR source. */
stp->cat6.type = utype_src(intr->src[0]);
|
||||
/* Classified as a private-memory write, conflicting with both
 * private-memory reads and writes.
 */
stp->barrier_class = IR3_BARRIER_PRIVATE_W;
|
||||
stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;
|
||||
|
||||
/* Record the store in the block's keeps array.
 * NOTE(review): presumably this stops the store being removed as unused,
 * since nothing consumes a result from it - confirm.
 */
array_insert(b, b->keeps, stp);
|
||||
}
|
||||
|
||||
struct tex_src_info {
|
||||
/* For prefetch */
|
||||
unsigned tex_base, samp_base, tex_idx, samp_idx;
|
||||
@@ -1714,6 +1765,12 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
||||
case nir_intrinsic_shared_atomic_comp_swap:
|
||||
dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
|
||||
break;
|
||||
case nir_intrinsic_load_scratch:
|
||||
emit_intrinsic_load_scratch(ctx, intr, dst);
|
||||
break;
|
||||
case nir_intrinsic_store_scratch:
|
||||
emit_intrinsic_store_scratch(ctx, intr);
|
||||
break;
|
||||
case nir_intrinsic_image_load:
|
||||
emit_intrinsic_load_image(ctx, intr, dst);
|
||||
break;
|
||||
@@ -3347,6 +3404,8 @@ emit_instructions(struct ir3_context *ctx)
|
||||
ctx->so->cull_mask = MASK(ctx->s->info.cull_distance_array_size) <<
|
||||
ctx->s->info.clip_distance_array_size;
|
||||
|
||||
ctx->so->pvtmem_size = ctx->s->scratch_size;
|
||||
|
||||
/* NOTE: need to do something more clever when we support >1 fxn */
|
||||
nir_foreach_register (reg, &fxn->registers) {
|
||||
ir3_declare_array(ctx, reg);
|
||||
|
||||
@@ -261,6 +261,7 @@ should_split_wrmask(const nir_instr *instr, const void *data)
|
||||
case nir_intrinsic_store_ssbo:
|
||||
case nir_intrinsic_store_shared:
|
||||
case nir_intrinsic_store_global:
|
||||
case nir_intrinsic_store_scratch:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
@@ -144,6 +144,11 @@ void * ir3_shader_assemble(struct ir3_shader_variant *v)
|
||||
if (compiler->gpu_id >= 400)
|
||||
v->constlen = align(v->constlen, 4);
|
||||
|
||||
/* Use the per-wave layout by default on a6xx. It should result in better
|
||||
* performance when loads/stores are to a uniform index.
|
||||
*/
|
||||
v->pvtmem_per_wave = compiler->gpu_id >= 600 && !v->info.multi_dword_ldp_stp;
|
||||
|
||||
fixup_regfootprint(v);
|
||||
|
||||
return bin;
|
||||
|
||||
@@ -554,6 +554,11 @@ struct ir3_shader_variant {
|
||||
*/
|
||||
unsigned constlen;
|
||||
|
||||
/* The private memory size in bytes */
|
||||
unsigned pvtmem_size;
|
||||
/* Whether we should use the new per-wave layout rather than per-fiber. */
|
||||
bool pvtmem_per_wave;
|
||||
|
||||
/* About Linkage:
|
||||
* + Let the frag shader determine the position/compmask for the
|
||||
* varyings, since it is the place where we know if the varying
|
||||
|
||||
Reference in New Issue
Block a user