vc4: Add support for storing to NIR registers in a non-SSA fashion.

Previously, NIR registers occasionally appeared in our programs, but they
were only ever used in an SSA-like fashion.  Now that we're trying to support
control flow, we need to conditionally move results into registers based on
whether each channel is active or not.
This commit is contained in:
Eric Anholt
2016-04-27 16:01:24 -07:00
parent ab1d40b84a
commit f505f66cd5
2 changed files with 144 additions and 85 deletions
+132 -85
View File
@@ -151,6 +151,43 @@ ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
return qregs;
}
/**
 * Stores one channel of an instruction's result into its NIR destination.
 *
 * SSA destinations simply record \p result in the def's qreg array, creating
 * that array with ntq_init_ssa_def() the first time the def is seen.
 *
 * NIR register destinations are written with a MOV into the register's
 * existing qreg.  When c->execute is in use (its file is not QFILE_NULL),
 * flags are set from c->execute and the MOV is made conditional
 * (QPU_COND_ZS), so that only channels that are currently active update the
 * register.
 *
 * \param c      compile context
 * \param dest   NIR destination being written
 * \param chan   channel of the destination to write
 * \param result value to store in that channel
 */
static void
ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
               struct qreg result)
{
        if (dest->is_ssa) {
                assert(chan < dest->ssa.num_components);

                struct qreg *qregs;
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, &dest->ssa);

                if (entry)
                        qregs = entry->data;
                else
                        qregs = ntq_init_ssa_def(c, &dest->ssa);

                qregs[chan] = result;
        } else {
                nir_register *reg = dest->reg.reg;

                /* Indirect and array register access is not handled here. */
                assert(dest->reg.base_offset == 0);
                assert(reg->num_array_elems == 0);

                /* Mirror the SSA path: never write past the register's
                 * channels.
                 */
                assert(chan < reg->num_components);

                /* Unlike SSA defs, registers are not created lazily here;
                 * they are expected to already be present in def_ht, so a
                 * failed lookup is a compiler bug rather than something to
                 * recover from.
                 */
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, reg);
                assert(entry);
                struct qreg *qregs = entry->data;

                /* Conditionally move the result to the destination if the
                 * channel is active.
                 */
                if (c->execute.file != QFILE_NULL) {
                        /* The flags must be set immediately before the
                         * conditional MOV that consumes them.
                         */
                        qir_SF(c, c->execute);
                        qir_MOV_cond(c, QPU_COND_ZS, qregs[chan], result);
                } else {
                        qir_MOV_dest(c, qregs[chan], result);
                }
        }
}
static struct qreg *
ntq_get_dest(struct vc4_compile *c, nir_dest *dest)
{
@@ -300,7 +337,7 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
struct qreg tex = qir_TEX_RESULT(c);
c->num_texture_samples++;
struct qreg *dest = ntq_get_dest(c, &instr->dest);
struct qreg dest[4];
enum pipe_format format = c->key->tex[unit].format;
if (util_format_is_depth_or_stencil(format)) {
struct qreg scaled = ntq_scale_depth_texture(c, tex);
@@ -310,6 +347,9 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
for (int i = 0; i < 4; i++)
dest[i] = qir_UNPACK_8_F(c, tex, i);
}
for (int i = 0; i < 4; i++)
ntq_store_dest(c, &instr->dest, i, dest[i]);
}
static void
@@ -731,10 +771,10 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
*dest = qir_PACK_8888_F(c,
ntq_get_src(c, instr->src[0].src,
instr->src[0].swizzle[0]));
struct qreg rep = ntq_get_src(c,
instr->src[0].src,
instr->src[0].swizzle[0]);
ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep));
return;
}
@@ -764,8 +804,7 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
qir_PACK_8_F(c, result, src, i);
}
struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
*dest = result;
ntq_store_dest(c, &instr->dest.dest, 0, result);
}
/** Handles sign-extended bitfield extracts for 16 bits. */
@@ -901,6 +940,9 @@ out:
static void
ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
{
/* This should always be lowered to ALU operations for VC4. */
assert(!instr->dest.saturate);
/* Vectors are special in that they have non-scalarized writemasks,
* and just take the first swizzle channel for each argument in order
* into each writemask channel.
@@ -912,9 +954,8 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
srcs[i] = ntq_get_src(c, instr->src[i].src,
instr->src[i].swizzle[0]);
struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
dest[i] = srcs[i];
ntq_store_dest(c, &instr->dest.dest, i, srcs[i]);
return;
}
@@ -926,10 +967,10 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
if (instr->op == nir_op_unpack_unorm_4x8) {
struct qreg src = ntq_get_src(c, instr->src[0].src,
instr->src[0].swizzle[0]);
struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
for (int i = 0; i < 4; i++) {
if (instr->dest.write_mask & (1 << i))
dest[i] = qir_UNPACK_8_F(c, src, i);
ntq_store_dest(c, &instr->dest.dest, i,
qir_UNPACK_8_F(c, src, i));
}
return;
}
@@ -940,91 +981,87 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
src[i] = ntq_get_alu_src(c, instr, i);
}
/* Pick the channel to store the output in. */
assert(!instr->dest.saturate);
struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
assert(util_is_power_of_two(instr->dest.write_mask));
dest += ffs(instr->dest.write_mask) - 1;
struct qreg result;
switch (instr->op) {
case nir_op_fmov:
case nir_op_imov:
*dest = qir_MOV(c, src[0]);
result = qir_MOV(c, src[0]);
break;
case nir_op_fmul:
*dest = qir_FMUL(c, src[0], src[1]);
result = qir_FMUL(c, src[0], src[1]);
break;
case nir_op_fadd:
*dest = qir_FADD(c, src[0], src[1]);
result = qir_FADD(c, src[0], src[1]);
break;
case nir_op_fsub:
*dest = qir_FSUB(c, src[0], src[1]);
result = qir_FSUB(c, src[0], src[1]);
break;
case nir_op_fmin:
*dest = qir_FMIN(c, src[0], src[1]);
result = qir_FMIN(c, src[0], src[1]);
break;
case nir_op_fmax:
*dest = qir_FMAX(c, src[0], src[1]);
result = qir_FMAX(c, src[0], src[1]);
break;
case nir_op_f2i:
case nir_op_f2u:
*dest = qir_FTOI(c, src[0]);
result = qir_FTOI(c, src[0]);
break;
case nir_op_i2f:
case nir_op_u2f:
*dest = qir_ITOF(c, src[0]);
result = qir_ITOF(c, src[0]);
break;
case nir_op_b2f:
*dest = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
break;
case nir_op_b2i:
*dest = qir_AND(c, src[0], qir_uniform_ui(c, 1));
result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
break;
case nir_op_i2b:
case nir_op_f2b:
qir_SF(c, src[0]);
*dest = qir_SEL(c, QPU_COND_ZC,
qir_uniform_ui(c, ~0),
qir_uniform_ui(c, 0));
result = qir_SEL(c, QPU_COND_ZC,
qir_uniform_ui(c, ~0),
qir_uniform_ui(c, 0));
break;
case nir_op_iadd:
*dest = qir_ADD(c, src[0], src[1]);
result = qir_ADD(c, src[0], src[1]);
break;
case nir_op_ushr:
*dest = qir_SHR(c, src[0], src[1]);
result = qir_SHR(c, src[0], src[1]);
break;
case nir_op_isub:
*dest = qir_SUB(c, src[0], src[1]);
result = qir_SUB(c, src[0], src[1]);
break;
case nir_op_ishr:
*dest = qir_ASR(c, src[0], src[1]);
result = qir_ASR(c, src[0], src[1]);
break;
case nir_op_ishl:
*dest = qir_SHL(c, src[0], src[1]);
result = qir_SHL(c, src[0], src[1]);
break;
case nir_op_imin:
*dest = qir_MIN(c, src[0], src[1]);
result = qir_MIN(c, src[0], src[1]);
break;
case nir_op_imax:
*dest = qir_MAX(c, src[0], src[1]);
result = qir_MAX(c, src[0], src[1]);
break;
case nir_op_iand:
*dest = qir_AND(c, src[0], src[1]);
result = qir_AND(c, src[0], src[1]);
break;
case nir_op_ior:
*dest = qir_OR(c, src[0], src[1]);
result = qir_OR(c, src[0], src[1]);
break;
case nir_op_ixor:
*dest = qir_XOR(c, src[0], src[1]);
result = qir_XOR(c, src[0], src[1]);
break;
case nir_op_inot:
*dest = qir_NOT(c, src[0]);
result = qir_NOT(c, src[0]);
break;
case nir_op_imul:
*dest = ntq_umul(c, src[0], src[1]);
result = ntq_umul(c, src[0], src[1]);
break;
case nir_op_seq:
@@ -1040,90 +1077,90 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
case nir_op_ige:
case nir_op_uge:
case nir_op_ilt:
if (!ntq_emit_comparison(c, dest, instr, instr)) {
if (!ntq_emit_comparison(c, &result, instr, instr)) {
fprintf(stderr, "Bad comparison instruction\n");
}
break;
case nir_op_bcsel:
*dest = ntq_emit_bcsel(c, instr, src);
result = ntq_emit_bcsel(c, instr, src);
break;
case nir_op_fcsel:
qir_SF(c, src[0]);
*dest = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
result = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
break;
case nir_op_frcp:
*dest = ntq_rcp(c, src[0]);
result = ntq_rcp(c, src[0]);
break;
case nir_op_frsq:
*dest = ntq_rsq(c, src[0]);
result = ntq_rsq(c, src[0]);
break;
case nir_op_fexp2:
*dest = qir_EXP2(c, src[0]);
result = qir_EXP2(c, src[0]);
break;
case nir_op_flog2:
*dest = qir_LOG2(c, src[0]);
result = qir_LOG2(c, src[0]);
break;
case nir_op_ftrunc:
*dest = qir_ITOF(c, qir_FTOI(c, src[0]));
result = qir_ITOF(c, qir_FTOI(c, src[0]));
break;
case nir_op_fceil:
*dest = ntq_fceil(c, src[0]);
result = ntq_fceil(c, src[0]);
break;
case nir_op_ffract:
*dest = ntq_ffract(c, src[0]);
result = ntq_ffract(c, src[0]);
break;
case nir_op_ffloor:
*dest = ntq_ffloor(c, src[0]);
result = ntq_ffloor(c, src[0]);
break;
case nir_op_fsin:
*dest = ntq_fsin(c, src[0]);
result = ntq_fsin(c, src[0]);
break;
case nir_op_fcos:
*dest = ntq_fcos(c, src[0]);
result = ntq_fcos(c, src[0]);
break;
case nir_op_fsign:
*dest = ntq_fsign(c, src[0]);
result = ntq_fsign(c, src[0]);
break;
case nir_op_fabs:
*dest = qir_FMAXABS(c, src[0], src[0]);
result = qir_FMAXABS(c, src[0], src[0]);
break;
case nir_op_iabs:
*dest = qir_MAX(c, src[0],
result = qir_MAX(c, src[0],
qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
break;
case nir_op_ibitfield_extract:
*dest = ntq_emit_ibfe(c, src[0], src[1], src[2]);
result = ntq_emit_ibfe(c, src[0], src[1], src[2]);
break;
case nir_op_ubitfield_extract:
*dest = ntq_emit_ubfe(c, src[0], src[1], src[2]);
result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
break;
case nir_op_usadd_4x8:
*dest = qir_V8ADDS(c, src[0], src[1]);
result = qir_V8ADDS(c, src[0], src[1]);
break;
case nir_op_ussub_4x8:
*dest = qir_V8SUBS(c, src[0], src[1]);
result = qir_V8SUBS(c, src[0], src[1]);
break;
case nir_op_umin_4x8:
*dest = qir_V8MIN(c, src[0], src[1]);
result = qir_V8MIN(c, src[0], src[1]);
break;
case nir_op_umax_4x8:
*dest = qir_V8MAX(c, src[0], src[1]);
result = qir_V8MAX(c, src[0], src[1]);
break;
case nir_op_umul_unorm_4x8:
*dest = qir_V8MULD(c, src[0], src[1]);
result = qir_V8MULD(c, src[0], src[1]);
break;
default:
@@ -1132,6 +1169,13 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
fprintf(stderr, "\n");
abort();
}
/* We have a scalar result, so the instruction should only have a
* single channel written to.
*/
assert(util_is_power_of_two(instr->dest.write_mask));
ntq_store_dest(c, &instr->dest.dest,
ffs(instr->dest.write_mask) - 1, result);
}
static void
@@ -1473,7 +1517,7 @@ ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)
_mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
for (int i = 0; i < array_len * nir_reg->num_components; i++)
qregs[i] = qir_uniform_ui(c, 0);
qregs[i] = qir_get_temp(c);
}
}
@@ -1502,14 +1546,8 @@ ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)
static void
ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
{
const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
nir_const_value *const_offset;
unsigned offset;
struct qreg *dest = NULL;
if (info->has_dest) {
dest = ntq_get_dest(c, &instr->dest);
}
switch (instr->intrinsic) {
case nir_intrinsic_load_uniform:
@@ -1521,36 +1559,43 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
/* We need dwords */
offset = offset / 4;
if (offset < VC4_NIR_STATE_UNIFORM_OFFSET) {
*dest = qir_uniform(c, QUNIFORM_UNIFORM,
offset);
ntq_store_dest(c, &instr->dest, 0,
qir_uniform(c, QUNIFORM_UNIFORM,
offset));
} else {
*dest = qir_uniform(c, offset -
VC4_NIR_STATE_UNIFORM_OFFSET,
0);
ntq_store_dest(c, &instr->dest, 0,
qir_uniform(c, offset -
VC4_NIR_STATE_UNIFORM_OFFSET,
0));
}
} else {
*dest = indirect_uniform_load(c, instr);
ntq_store_dest(c, &instr->dest, 0,
indirect_uniform_load(c, instr));
}
break;
case nir_intrinsic_load_user_clip_plane:
for (int i = 0; i < instr->num_components; i++) {
dest[i] = qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
instr->const_index[0] * 4 + i);
ntq_store_dest(c, &instr->dest, i,
qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
instr->const_index[0] * 4 +
i));
}
break;
case nir_intrinsic_load_sample_mask_in:
*dest = qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0);
ntq_store_dest(c, &instr->dest, 0,
qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
break;
case nir_intrinsic_load_front_face:
/* The register contains 0 (front) or 1 (back), and we need to
* turn it into a NIR bool where true means front.
*/
*dest = qir_ADD(c,
qir_uniform_ui(c, -1),
qir_reg(QFILE_FRAG_REV_FLAG, 0));
ntq_store_dest(c, &instr->dest, 0,
qir_ADD(c,
qir_uniform_ui(c, -1),
qir_reg(QFILE_FRAG_REV_FLAG, 0)));
break;
case nir_intrinsic_load_input:
@@ -1570,10 +1615,12 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
qir_TLB_COLOR_READ(c);
}
}
*dest = c->color_reads[sample_index];
ntq_store_dest(c, &instr->dest, 0,
c->color_reads[sample_index]);
} else {
offset = instr->const_index[0] + const_offset->u32[0];
*dest = c->inputs[offset];
ntq_store_dest(c, &instr->dest, 0,
c->inputs[offset]);
}
break;
+12
View File
@@ -408,6 +408,11 @@ struct vc4_compile {
uint32_t num_ubo_ranges;
uint32_t next_ubo_dst_offset;
/* State for whether we're executing on each channel currently. 0 if
* yes, otherwise a block number + 1 that the channel jumped to.
*/
struct qreg execute;
struct qreg line_x, point_x, point_y;
struct qreg discard;
struct qreg payload_FRAG_Z;
@@ -760,6 +765,13 @@ qir_LOAD_IMM(struct vc4_compile *c, uint32_t val)
qir_reg(QFILE_LOAD_IMM, val), c->undef));
}
static inline void
qir_MOV_cond(struct vc4_compile *c, uint8_t cond,
struct qreg dest, struct qreg src)
{
qir_MOV_dest(c, dest, src)->cond = cond;
}
static inline struct qinst *
qir_BRANCH(struct vc4_compile *c, uint8_t cond)
{