diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h
index 6a1d533977b..de571c3c1c4 100644
--- a/src/gallium/drivers/lima/ir/gp/gpir.h
+++ b/src/gallium/drivers/lima/ir/gp/gpir.h
@@ -174,6 +174,7 @@ typedef struct gpir_node {
          bool ready;
          bool inserted;
          bool max_node, next_max_node;
+         bool complex_allowed;
       } sched;
       struct {
          int parent_index;
@@ -288,16 +289,22 @@ typedef struct gpir_instr {
     * (3) There is a store instruction scheduled, but not its child.
     *
     * The complex slot cannot be used for a move in case (1), since it only
-    * has a FIFO depth of 1, but it can be used for (2) and (3). In order to
-    * ensure that we have enough space for all three, we maintain the
-    * following invariants:
+    * has a FIFO depth of 1, but it can be used for (2) as well as (3) as long
+    * as the uses aren't in certain slots. It turns out that we don't have to
+    * worry about nodes that can't use the complex slot for (2), since there
+    * are at most 4 uses 1 cycle ago that can't use the complex slot, but we
+    * do have to worry about (3). This means tracking stores whose children
+    * cannot be in the complex slot. In order to ensure that we have enough
+    * space for all three, we maintain the following invariants:
     *
     * (1) alu_num_slot_free >= alu_num_slot_needed_by_store +
     *       alu_num_slot_needed_by_max +
     *       alu_num_slot_needed_by_next_max
-    * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max
+    * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max +
+    *       alu_num_slot_needed_by_non_cplx_store
     */
    int alu_num_slot_needed_by_store;
+   int alu_num_slot_needed_by_non_cplx_store;
    int alu_num_slot_needed_by_max;
    int alu_num_slot_needed_by_next_max;
 
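To make the reservation invariants above concrete, here is a minimal standalone sketch (not part of the patch): the struct only mirrors the relevant gpir_instr counters, and the real checks in instr.c additionally credit slots freed by the node being inserted (the *_consume_slot and *_reduce_slot terms in the hunks below).

   /* Illustrative only: a reduced model of the gpir_instr counters and the
    * two reservation invariants documented in gpir.h. */
   #include <stdbool.h>

   struct slot_state {
      int alu_num_slot_free;
      int alu_non_cplx_slot_free;
      int alu_num_slot_needed_by_store;
      int alu_num_slot_needed_by_non_cplx_store;
      int alu_num_slot_needed_by_max;
      int alu_num_slot_needed_by_next_max;
   };

   /* Both invariants must hold after every insertion or removal. */
   static bool invariants_hold(const struct slot_state *s)
   {
      /* (1) enough ALU slots overall for stores, max moves and next-max moves */
      if (s->alu_num_slot_free < s->alu_num_slot_needed_by_store +
                                 s->alu_num_slot_needed_by_max +
                                 s->alu_num_slot_needed_by_next_max)
         return false;

      /* (2) enough non-complex slots for max moves and for store children
       * that are not allowed to live in the complex slot */
      if (s->alu_non_cplx_slot_free < s->alu_num_slot_needed_by_max +
                                      s->alu_num_slot_needed_by_non_cplx_store)
         return false;

      return true;
   }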
diff --git a/src/gallium/drivers/lima/ir/gp/instr.c b/src/gallium/drivers/lima/ir/gp/instr.c
index 228d1459280..e07a2c9b7c2 100644
--- a/src/gallium/drivers/lima/ir/gp/instr.c
+++ b/src/gallium/drivers/lima/ir/gp/instr.c
@@ -85,10 +85,15 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    if (!gpir_instr_check_acc_same_op(instr, node, node->sched.pos))
       return false;
 
+   if (node->sched.next_max_node && !node->sched.complex_allowed &&
+       node->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
+      return false;
+
    int consume_slot = gpir_instr_get_consume_slot(instr, node);
    int non_cplx_consume_slot =
       node->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 0 : consume_slot;
    int store_reduce_slot = 0;
+   int non_cplx_store_reduce_slot = 0;
    int max_reduce_slot = node->sched.max_node ? 1 : 0;
    int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0;
 
@@ -100,6 +105,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
       gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
       if (s && s->child == node) {
          store_reduce_slot = 1;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            non_cplx_store_reduce_slot = 1;
          break;
       }
    }
@@ -118,7 +125,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    }
 
    int non_cplx_slot_difference =
-      instr->alu_num_slot_needed_by_max - max_reduce_slot -
+      instr->alu_num_slot_needed_by_max - max_reduce_slot +
+      instr->alu_num_slot_needed_by_non_cplx_store - non_cplx_store_reduce_slot -
       (instr->alu_non_cplx_slot_free - non_cplx_consume_slot);
    if (non_cplx_slot_difference > 0) {
       gpir_debug("failed %d because of alu slot\n", node->index);
@@ -131,6 +139,7 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    instr->alu_num_slot_free -= consume_slot;
    instr->alu_non_cplx_slot_free -= non_cplx_consume_slot;
    instr->alu_num_slot_needed_by_store -= store_reduce_slot;
+   instr->alu_num_slot_needed_by_non_cplx_store -= non_cplx_store_reduce_slot;
    instr->alu_num_slot_needed_by_max -= max_reduce_slot;
    instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot;
    return true;
@@ -144,6 +153,8 @@ static void gpir_instr_remove_alu(gpir_instr *instr, gpir_node *node)
       gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
       if (s && s->child == node) {
          instr->alu_num_slot_needed_by_store++;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            instr->alu_num_slot_needed_by_non_cplx_store++;
          break;
       }
    }
@@ -296,7 +307,7 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
    }
 
    /* Check the invariants documented in gpir.h, similar to the ALU case.
-    * Since the only thing that changes is alu_num_slot_needed_by_store, we
+    * When the only thing that changes is alu_num_slot_needed_by_store, we
     * can get away with just checking the first one.
     */
    int slot_difference = instr->alu_num_slot_needed_by_store + 1
@@ -308,6 +319,25 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
       return false;
    }
 
+   if (store->child->sched.next_max_node &&
+       !store->child->sched.complex_allowed) {
+      /* The child of the store is already partially ready, and has a use one
+       * cycle ago that disqualifies it (or a move replacing it) from being
+       * put in the complex slot. Therefore we have to check the non-complex
+       * invariant.
+       */
+      int non_cplx_slot_difference =
+          instr->alu_num_slot_needed_by_max +
+          instr->alu_num_slot_needed_by_non_cplx_store + 1 -
+          instr->alu_non_cplx_slot_free;
+      if (non_cplx_slot_difference > 0) {
+         instr->non_cplx_slot_difference = non_cplx_slot_difference;
+         return false;
+      }
+
+      instr->alu_num_slot_needed_by_non_cplx_store++;
+   }
+
    instr->alu_num_slot_needed_by_store++;
 
 out:
@@ -346,6 +376,11 @@ static void gpir_instr_remove_store(gpir_instr *instr, gpir_node *node)
 
    instr->alu_num_slot_needed_by_store--;
 
+   if (store->child->sched.next_max_node &&
+       !store->child->sched.complex_allowed) {
+      instr->alu_num_slot_needed_by_non_cplx_store--;
+   }
+
 out:
    if (!instr->slots[other_slot])
       instr->store_content[component >> 1] = GPIR_INSTR_STORE_NONE;
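The new bookkeeping for case (3) is easiest to see with made-up numbers. In the hypothetical walkthrough below (not taken from the driver), a store is being inserted whose child is a next-max node that was consumed last cycle in a slot that rules out the complex unit, so gpir_instr_insert_store_check() has to reserve a non-complex ALU slot on top of the generic one, mirroring both "+ 1" checks above:

   /* Hypothetical values only, chosen so that both provisional checks pass. */
   #include <assert.h>

   int main(void)
   {
      int alu_num_slot_free = 3;              /* generic ALU slots still free     */
      int alu_non_cplx_slot_free = 2;         /* non-complex ALU slots still free */
      int needed_by_store = 0;
      int needed_by_non_cplx_store = 0;
      int needed_by_max = 1;
      int needed_by_next_max = 0;

      /* Invariant (1) with the provisional "+ 1" for the store being added. */
      assert(alu_num_slot_free >=
             needed_by_store + 1 + needed_by_max + needed_by_next_max);

      /* Invariant (2), which the store check only has to test when the child
       * cannot live in the complex slot. */
      assert(alu_non_cplx_slot_free >=
             needed_by_max + needed_by_non_cplx_store + 1);

      /* Both hold, so the reservations are recorded. */
      needed_by_store++;
      needed_by_non_cplx_store++;
      return 0;
   }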
diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c
index 60076daa3c4..7149e1dd213 100644
--- a/src/gallium/drivers/lima/ir/gp/scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/scheduler.c
@@ -640,6 +640,7 @@ static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
    move->node.sched.dist = node->sched.dist;
    move->node.sched.max_node = node->sched.max_node;
    move->node.sched.next_max_node = node->sched.next_max_node;
+   move->node.sched.complex_allowed = node->sched.complex_allowed;
 
    gpir_debug("create move %d for %d\n", move->node.index, node->index);
 
@@ -974,6 +975,7 @@ static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
       store->child = node;
       store->node.sched.max_node = false;
       store->node.sched.next_max_node = false;
+      store->node.sched.complex_allowed = false;
       store->node.sched.pos = -1;
       store->node.sched.instr = NULL;
       store->node.sched.inserted = false;
@@ -1086,6 +1088,44 @@ static int gpir_get_min_end_as_move(gpir_node *node)
    return min;
 }
 
+/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
+ * The hardware overwrites the add second sources with 0 and mul second
+ * sources with 1. This can be a problem if we need to insert more next-max
+ * moves but we only have values that can't use the complex unit for moves.
+ *
+ * Fortunately, we only need to insert a next-max move if there are more than
+ * 5 next-max nodes, but there are only 4 sources in the previous instruction
+ * that make values not complex-capable, which means there can be at most 4
+ * non-complex-capable values. Hence there will always be at least two values
+ * that can be rewritten to use a move in the complex slot. However, we have
+ * to be careful not to waste those values by putting both of them in a
+ * non-complex slot. This is handled for us by gpir_instr, which will reject
+ * such instructions. We just need to tell it which nodes can use complex, and
+ * it will do the accounting to figure out what is safe.
+ */
+
+static bool can_use_complex(gpir_node *node)
+{
+   gpir_node_foreach_succ(node, dep) {
+      if (dep->type != GPIR_DEP_INPUT)
+         continue;
+
+      gpir_node *succ = dep->succ;
+      if (succ->type != gpir_node_type_alu)
+         continue;
+
+      gpir_alu_node *alu = gpir_node_to_alu(succ);
+      if (alu->num_child >= 2 && alu->children[1] == node)
+         return false;
+
+      /* complex1 puts its third source in the fourth slot */
+      if (alu->node.op == gpir_op_complex1 && alu->children[2] == node)
+         return false;
+   }
+
+   return true;
+}
+
 /* Initialize node->sched.max_node and node->sched.next_max_node for every
  * input node on the ready list. We should only need to do this once per
  * instruction, at the beginning, since we never add max nodes to the ready
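The counting argument in the new comment can be restated as a couple of lines of arithmetic. This is only an illustration of the worst case the comment describes, not driver code:

   #include <assert.h>

   int main(void)
   {
      /* A next-max move only has to be reserved once more than 5 next-max
       * nodes exist. */
      int next_max_nodes = 6;       /* smallest count that forces a move */

      /* At most the four second sources of add0/add1/mul0/mul1 in the
       * previous instruction can make a value non-complex-capable. */
      int non_complex_capable = 4;  /* upper bound */

      /* So at least two of the values needing moves can always be rewritten
       * as a move in the complex slot. */
      assert(next_max_nodes - non_complex_capable >= 2);
      return 0;
   }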
@@ -1104,6 +1144,8 @@ static void sched_find_max_nodes(sched_ctx *ctx)
       int min_end_move = gpir_get_min_end_as_move(node);
       node->sched.max_node = (min_end_move == ctx->instr->index);
       node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
+      if (node->sched.next_max_node)
+         node->sched.complex_allowed = can_use_complex(node);
 
       if (node->sched.max_node)
          ctx->instr->alu_num_slot_needed_by_max++;
@@ -1120,6 +1162,7 @@ static void verify_max_nodes(sched_ctx *ctx)
    int alu_num_slot_needed_by_max = 0;
    int alu_num_slot_needed_by_next_max = -5;
    int alu_num_slot_needed_by_store = 0;
+   int alu_num_slot_needed_by_non_cplx_store = 0;
 
    list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
       if (!gpir_is_input_node(node))
@@ -1129,15 +1172,20 @@ static void verify_max_nodes(sched_ctx *ctx)
          alu_num_slot_needed_by_max++;
       if (node->sched.next_max_node)
          alu_num_slot_needed_by_next_max++;
-      if (used_by_store(node, ctx->instr))
+      if (used_by_store(node, ctx->instr)) {
          alu_num_slot_needed_by_store++;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            alu_num_slot_needed_by_non_cplx_store++;
+      }
    }
 
    assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
    assert(ctx->instr->alu_num_slot_needed_by_next_max == alu_num_slot_needed_by_next_max);
    assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
-   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + alu_num_slot_needed_by_next_max);
-   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max);
+   assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
+          alu_num_slot_needed_by_non_cplx_store);
+   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_slot_needed_by_next_max, 0));
+   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
 }
 
 static bool try_node(sched_ctx *ctx)
@@ -1207,6 +1255,22 @@ static void place_move(sched_ctx *ctx, gpir_node *node)
    assert(score != INT_MIN);
 }
 
+/* For next-max nodes, not every node can be offloaded to a move in the
+ * complex slot. If we run out of non-complex slots, then such nodes cannot
+ * have moves placed for them. There should always be sufficient
+ * complex-capable nodes so that this isn't a problem.
+ */
+static bool can_place_move(sched_ctx *ctx, gpir_node *node)
+{
+   if (!node->sched.next_max_node)
+      return true;
+
+   if (node->sched.complex_allowed)
+      return true;
+
+   return ctx->instr->alu_non_cplx_slot_free > 0;
+}
+
 static bool sched_move(sched_ctx *ctx)
 {
    list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
@@ -1250,6 +1314,9 @@ static bool sched_move(sched_ctx *ctx)
     */
    if (ctx->instr->alu_num_slot_free > 0) {
       list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
          if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
              node->sched.ready) {
             bool skip = true;
@@ -1284,6 +1351,9 @@ static bool sched_move(sched_ctx *ctx)
     */
    if (ctx->instr->alu_num_slot_free > 0) {
       list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
          if (node->sched.next_max_node &&
              !(node->op == gpir_op_complex1 && node->sched.ready)) {
             place_move(ctx, node);
@@ -1298,6 +1368,9 @@ static bool sched_move(sched_ctx *ctx)
 
    if (ctx->instr->alu_num_slot_needed_by_next_max > 0) {
       list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
          if (node->sched.next_max_node) {
             place_move(ctx, node);
             return true;
@@ -1546,6 +1619,7 @@ bool gpir_schedule_prog(gpir_compiler *comp)
          node->sched.physreg_store = NULL;
          node->sched.ready = false;
          node->sched.inserted = false;
+         node->sched.complex_allowed = false;
          node->sched.max_node = false;
          node->sched.next_max_node = false;
       }
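Finally, the effect of the can_place_move() guard is easiest to see in isolation. The toy types below stand in for the driver's node and instruction state; the point is that when only the complex slot remains free for moves, a complex-capable next-max node can still be given a move, while one whose uses forbid the complex slot is skipped until a non-complex slot opens up:

   /* Toy stand-ins, not the driver's types; illustrative only. */
   #include <assert.h>
   #include <stdbool.h>

   struct toy_instr { int alu_non_cplx_slot_free; };
   struct toy_node  { bool next_max_node; bool complex_allowed; };

   /* Mirrors the decision made by can_place_move() above. */
   static bool toy_can_place_move(const struct toy_instr *instr,
                                  const struct toy_node *node)
   {
      if (!node->next_max_node || node->complex_allowed)
         return true;
      return instr->alu_non_cplx_slot_free > 0;
   }

   int main(void)
   {
      /* Only the complex slot is still free for moves. */
      struct toy_instr instr = { .alu_non_cplx_slot_free = 0 };

      struct toy_node ok   = { .next_max_node = true, .complex_allowed = true  };
      struct toy_node wait = { .next_max_node = true, .complex_allowed = false };

      assert(toy_can_place_move(&instr, &ok));    /* move can go in complex */
      assert(!toy_can_place_move(&instr, &wait)); /* skipped for now        */
      return 0;
   }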