diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 39507184105..05bcdcc60b1 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -346,6 +346,9 @@ struct ir3_ra_ctx { unsigned *def, *use; /* def/use table */ struct ir3_ra_instr_data *instrd; + /* Mapping vreg name back to instruction, used select reg callback: */ + struct hash_table *name_to_instr; + /* Tracking for max half/full register assigned. We don't need to * track high registers. * @@ -354,8 +357,14 @@ struct ir3_ra_ctx { */ unsigned max_assigned; unsigned max_half_assigned; + + /* Tracking for select_reg callback */ + unsigned start_search_reg; + unsigned max_target; }; +static int scalar_name(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, unsigned n); + /* does it conflict? */ static inline bool intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end) @@ -640,6 +649,101 @@ ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) } } +static int +pick_in_range(BITSET_WORD *regs, unsigned min, unsigned max) +{ + for (unsigned i = min; i < max; i++) { + if (BITSET_TEST(regs, i)) { + return i; + } + } + return -1; +} + +/* register selector for the a6xx+ merged register file: */ +static unsigned int +ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data) +{ + struct ir3_ra_ctx *ctx = data; + unsigned int class = ra_get_node_class(ctx->g, n); + + /* dimensions within the register class: */ + unsigned max_target, start; + + /* the regs bitset will include *all* of the virtual regs, but we lay + * out the different classes consecutively in the virtual register + * space. So we just need to think about the base offset of a given + * class within the virtual register space, and offset the register + * space we search within by that base offset. + */ + unsigned base; + + /* NOTE: this is only used in scalar pass, so the register + * class will be one of the scalar classes (ie. idx==0): + */ + if (class == ctx->set->high_classes[0]) { + max_target = HIGH_CLASS_REGS(0); + start = 0; + base = ctx->set->gpr_to_ra_reg[HIGH_OFFSET][0]; + } else if (class == ctx->set->half_classes[0]) { + max_target = ctx->max_target; + start = ctx->start_search_reg; + base = ctx->set->gpr_to_ra_reg[HALF_OFFSET][0]; + } else if (class == ctx->set->classes[0]) { + max_target = ctx->max_target / 2; + start = ctx->start_search_reg; + base = ctx->set->gpr_to_ra_reg[0][0]; + } else { + unreachable("unexpected register class!"); + } + + /* For cat4 instructions, if the src reg is already assigned, and + * avail to pick, use it. Because this doesn't introduce unnecessary + * dependencies, and it potentially avoids needing (ss) syncs to + * for write after read hazards: + */ + struct hash_entry *entry = _mesa_hash_table_search(ctx->name_to_instr, &n); + if (entry) { + struct ir3_instruction *instr = entry->data; + + if (is_sfu(instr) && instr->regs[1]->instr) { + struct ir3_instruction *src = instr->regs[1]->instr; + unsigned src_n = scalar_name(ctx, src, 0); + + unsigned reg = ra_get_node_reg(ctx->g, src_n); + + /* Check if the src register has been assigned yet: */ + if (reg != NO_REG) { + if (BITSET_TEST(regs, reg)) { + return reg; + } + } + } + } + + int r = pick_in_range(regs, base + start, base + max_target); + if (r < 0) { + /* wrap-around: */ + r = pick_in_range(regs, base, base + start); + } + + if (r < 0) { + /* overflow, we need to increase max_target: */ + ctx->max_target++; + return ra_select_reg_merged(n, regs, data); + } + + if (class == ctx->set->half_classes[0]) { + int n = r - base; + ctx->start_search_reg = (n + 1) % ctx->max_target; + } else if (class == ctx->set->classes[0]) { + int n = (r - base) * 2; + ctx->start_search_reg = (n + 1) % ctx->max_target; + } + + return r; +} + static void ra_init(struct ir3_ra_ctx *ctx) { @@ -680,6 +784,14 @@ ra_init(struct ir3_ra_ctx *ctx) ralloc_steal(ctx->g, ctx->instrd); ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); + + /* TODO add selector callback for split (pre-a6xx) register file: */ + if (ctx->scalar_pass && (ctx->ir->compiler->gpu_id >= 600)) { + ra_set_select_reg_callback(ctx->g, ra_select_reg_merged, ctx); + + ctx->name_to_instr = _mesa_hash_table_create(ctx->g, + _mesa_hash_int, _mesa_key_int_equal); + } } static unsigned @@ -837,6 +949,16 @@ ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) def(name, instr); + if (ctx->name_to_instr && is_sfu(instr)) { + /* this is slightly annoying, we can't just use an + * integer on the stack + */ + unsigned *key = ralloc(ctx->name_to_instr, unsigned); + *key = name; + debug_assert(!_mesa_hash_table_search(ctx->name_to_instr, key)); + _mesa_hash_table_insert(ctx->name_to_instr, key, instr); + } + if ((instr->opc == OPC_META_INPUT) && first_non_input) use(name, first_non_input); @@ -1536,9 +1658,32 @@ ra_sanity_check(struct ir3 *ir) } } +/* Target is calculated in terms of half-regs (with a full reg + * consisting of two half-regs). + */ +static void +ra_calc_merged_register_target(struct ir3_ra_ctx *ctx) +{ + const unsigned vec4 = 2 * 4; // 8 half-regs + unsigned t = MAX2(2 * ctx->max_assigned, ctx->max_half_assigned); + + /* second RA pass may have saved some regs, let's try to reclaim + * the benefit by adjusting the target downwards slightly: + */ + if (ir3_has_latency_to_hide(ctx->ir)) { + if (t > 8 * vec4) { + t -= 2 * vec4; + } else if (t > 6 * vec4) { + t -= vec4; + } + } + + ctx->max_target = t; +} + static int ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor, - unsigned nprecolor, bool scalar_pass) + unsigned nprecolor, bool scalar_pass, unsigned *target) { struct ir3_ra_ctx ctx = { .v = v, @@ -1548,6 +1693,10 @@ ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor, }; int ret; + if (scalar_pass) { + ctx.max_target = *target; + } + ra_init(&ctx); ra_add_interference(&ctx); ra_precolor(&ctx, precolor, nprecolor); @@ -1556,7 +1705,16 @@ ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor, ret = ra_alloc(&ctx); ra_destroy(&ctx); - printf("#### max_assigned=%u, max_half_assigned=%u\n", ctx.max_assigned, ctx.max_half_assigned); + /* In the first pass, calculate the target register usage used in the + * second (scalar) pass: + */ + if (!scalar_pass) { + /* TODO: round-robin support for pre-a6xx: */ + if (ctx.ir->compiler->gpu_id >= 600) { + ra_calc_merged_register_target(&ctx); + } + *target = ctx.max_target; + } return ret; } @@ -1565,10 +1723,11 @@ int ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, unsigned nprecolor) { + unsigned target = 0; int ret; /* First pass, assign the vecN (non-scalar) registers: */ - ret = ir3_ra_pass(v, precolor, nprecolor, false); + ret = ir3_ra_pass(v, precolor, nprecolor, false, &target); if (ret) return ret; @@ -1578,7 +1737,7 @@ ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, } /* Second pass, assign the scalar registers: */ - ret = ir3_ra_pass(v, precolor, nprecolor, true); + ret = ir3_ra_pass(v, precolor, nprecolor, true, &target); if (ret) return ret;