v3d: Rotate through registers to improve post-RA scheduling options.

Similarly to VC4's implementation, by not picking r0 immediately upon
freeing it, we give the scheduler more of a chance to fit later writes in
earlier.  I'm not clear on whether there's any real cost to picking phys
regs over accumulators, so keep preferring accumulators for now.

shader-db:
total instructions in shared programs: 96831 -> 95669 (-1.20%)
instructions in affected programs:     77254 -> 76092 (-1.50%)
This commit is contained in:
Eric Anholt
2018-07-20 12:05:57 -07:00
parent 1fb31819ae
commit 8dfc6ee317
@@ -238,6 +238,43 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
BITSET_CLEAR(c->spillable, i);
}
/* Round-robin cursors used by v3d_ra_select_callback() to remember where
 * its search for a free register should begin on the next call.
 */
struct v3d_ra_select_callback_data {
/* Offset (relative to ACC_INDEX) at which the next accumulator search
 * starts.  Stored as "last chosen offset + 1" and reduced modulo
 * ACC_COUNT when it is read.
 */
uint32_t next_acc;
/* Offset (relative to PHYS_INDEX) at which the next physical-register
 * search starts.  Stored as "last chosen offset + 1" and reduced modulo
 * PHYS_COUNT when it is read.
 */
uint32_t next_phys;
};
/**
 * Register-selection callback handed to the register allocator.
 *
 * Picks one register out of the candidate set \p regs.  Accumulators are
 * tried first, then physical registers; within each bank the search starts
 * at the cursor saved in \p data and wraps around, so consecutive calls
 * rotate through the bank instead of always returning the lowest free
 * register.
 *
 * \param g     Interference graph being allocated (not used here).
 * \param regs  Bitset of registers that may be chosen for the current node.
 * \param data  Pointer to a struct v3d_ra_select_callback_data holding the
 *              per-bank cursors; updated on every successful pick.
 * \return      Index of the chosen register.
 */
static unsigned int
v3d_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
{
        struct v3d_ra_select_callback_data *cursors = data;

        /* Accumulators get first pick (believed, but not confirmed, to be
         * lower power than phys regs).  Rotating the starting point gives
         * post-RA instruction scheduling more freedom than always reusing
         * the register that was just freed.
         */
        for (int step = 0; step < ACC_COUNT; step++) {
                int slot = (cursors->next_acc + step) % ACC_COUNT;
                int reg = ACC_INDEX + slot;

                if (!BITSET_TEST(regs, reg))
                        continue;

                /* Resume after this slot next time; wrapped on read. */
                cursors->next_acc = slot + 1;
                return reg;
        }

        /* No accumulator available: rotate through the physical regfile
         * in the same fashion.
         */
        for (int step = 0; step < PHYS_COUNT; step++) {
                int slot = (cursors->next_phys + step) % PHYS_COUNT;
                int reg = PHYS_INDEX + slot;

                if (!BITSET_TEST(regs, reg))
                        continue;

                cursors->next_phys = slot + 1;
                return reg;
        }

        unreachable("RA must pass us at least one possible reg.");
}
bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
@@ -309,6 +346,13 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
struct qpu_reg *temp_registers = calloc(c->num_temps,
sizeof(*temp_registers));
int acc_nodes[ACC_COUNT];
struct v3d_ra_select_callback_data callback_data = {
.next_acc = 0,
/* Start at RF3, to try to keep the TLB writes from using
* RF0-2.
*/
.next_phys = 3,
};
*spilled = false;
@@ -328,6 +372,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
c->num_temps +
ARRAY_SIZE(acc_nodes));
ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);
/* Make some fixed nodes for the accumulators, which we will need to
* interfere with when ops have implied r3/r4 writes or for the thread