diff --git a/src/gallium/drivers/lima/ci/deqp-lima-fails.txt b/src/gallium/drivers/lima/ci/deqp-lima-fails.txt index 7bc535e8335..680b8f247fd 100644 --- a/src/gallium/drivers/lima/ci/deqp-lima-fails.txt +++ b/src/gallium/drivers/lima/ci/deqp-lima-fails.txt @@ -37,14 +37,6 @@ dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_ dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_dynamic_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_static_loop_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.matrix_subscript.mat4_dynamic_loop_write_static_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_const_write_dynamic_loop_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.float_const_write_dynamic_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_const_write_dynamic_loop_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec2_const_write_dynamic_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_vertex,Fail -dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_loop_write_dynamic_loop_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_loop_write_dynamic_read_vertex,Fail dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_dynamic_loop_write_static_loop_read_vertex,Fail diff --git a/src/gallium/drivers/lima/ir/gp/regalloc.c b/src/gallium/drivers/lima/ir/gp/regalloc.c index de519b210b8..8526d1e9e7d 100644 --- a/src/gallium/drivers/lima/ir/gp/regalloc.c +++ b/src/gallium/drivers/lima/ir/gp/regalloc.c @@ -31,14 +31,7 @@ struct reg_info { BITSET_WORD *conflicts; struct util_dynarray conflict_list; - /* Number of conflicts that must be allocated to physical registers. - */ - unsigned phys_conflicts; - - unsigned node_conflicts; - - /* Number of conflicts that can be allocated to either. */ - unsigned total_conflicts; + unsigned num_conflicts; int assigned_color; @@ -46,7 +39,7 @@ struct reg_info { }; struct regalloc_ctx { - unsigned bitset_words, num_nodes_and_regs; + unsigned bitset_words; struct reg_info *registers; /* Reusable scratch liveness array */ @@ -64,8 +57,8 @@ struct regalloc_ctx { /* Liveness analysis */ -static void propagate_liveness_instr(gpir_node *node, BITSET_WORD *live, - gpir_compiler *comp) +static void propagate_liveness_node(gpir_node *node, BITSET_WORD *live, + gpir_compiler *comp) { /* KILL */ if (node->type == gpir_node_type_store) { @@ -96,7 +89,7 @@ static bool propagate_liveness_block(gpir_block *block, struct regalloc_ctx *ctx memcpy(ctx->live, block->live_out, ctx->bitset_words * sizeof(BITSET_WORD)); list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { - propagate_liveness_instr(node, ctx->live, block->comp); + propagate_liveness_node(node, ctx->live, block->comp); } bool changed = false; @@ -166,18 +159,8 @@ static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j) BITSET_SET(a->conflicts, j); BITSET_SET(b->conflicts, i); - a->total_conflicts++; - b->total_conflicts++; - if (j < ctx->comp->cur_reg) - a->phys_conflicts++; - else - a->node_conflicts++; - - if (i < ctx->comp->cur_reg) - b->phys_conflicts++; - else - b->node_conflicts++; - + a->num_conflicts++; + b->num_conflicts++; util_dynarray_append(&a->conflict_list, unsigned, j); util_dynarray_append(&b->conflict_list, unsigned, i); } @@ -187,24 +170,17 @@ static void add_interference(struct regalloc_ctx *ctx, unsigned i, unsigned j) */ static void add_all_interferences(struct regalloc_ctx *ctx, unsigned i, - BITSET_WORD *live_nodes, BITSET_WORD *live_regs) { - int live_node; - BITSET_FOREACH_SET(live_node, live_nodes, ctx->comp->cur_index) { - add_interference(ctx, i, - live_node + ctx->comp->cur_reg); - } - int live_reg; - BITSET_FOREACH_SET(live_reg, ctx->live, ctx->comp->cur_index) { + BITSET_FOREACH_SET(live_reg, ctx->live, ctx->comp->cur_reg) { add_interference(ctx, i, live_reg); } } static void print_liveness(struct regalloc_ctx *ctx, - BITSET_WORD *live_reg, BITSET_WORD *live_val) + BITSET_WORD *live_reg) { if (!(lima_debug & LIMA_DEBUG_GP)) return; @@ -213,17 +189,11 @@ static void print_liveness(struct regalloc_ctx *ctx, BITSET_FOREACH_SET(live_idx, live_reg, ctx->comp->cur_reg) { printf("reg%d ", live_idx); } - BITSET_FOREACH_SET(live_idx, live_val, ctx->comp->cur_index) { - printf("%d ", live_idx); - } printf("\n"); } static void calc_interference(struct regalloc_ctx *ctx) { - BITSET_WORD *live_nodes = - rzalloc_array(ctx->mem_ctx, BITSET_WORD, ctx->comp->cur_index); - list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) { /* Initialize liveness at the end of the block, but exclude values that * definitely aren't defined by the end. This helps out with @@ -247,35 +217,15 @@ static void calc_interference(struct regalloc_ctx *ctx) list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { gpir_debug("processing node %d\n", node->index); - print_liveness(ctx, ctx->live, live_nodes); - if (node->type != gpir_node_type_store && - node->type != gpir_node_type_branch) { - add_all_interferences(ctx, node->index + ctx->comp->cur_reg, - live_nodes, ctx->live); - - /* KILL */ - BITSET_CLEAR(live_nodes, node->index); - } else if (node->op == gpir_op_store_reg) { + print_liveness(ctx, ctx->live); + if (node->op == gpir_op_store_reg) { gpir_store_node *store = gpir_node_to_store(node); - add_all_interferences(ctx, store->reg->index, - live_nodes, ctx->live); + add_all_interferences(ctx, store->reg->index, ctx->live); /* KILL */ BITSET_CLEAR(ctx->live, store->reg->index); - } - - /* GEN */ - if (node->type == gpir_node_type_store) { - gpir_store_node *store = gpir_node_to_store(node); - BITSET_SET(live_nodes, store->child->index); - } else if (node->type == gpir_node_type_alu) { - gpir_alu_node *alu = gpir_node_to_alu(node); - for (int i = 0; i < alu->num_child; i++) - BITSET_SET(live_nodes, alu->children[i]->index); - } else if (node->type == gpir_node_type_branch) { - gpir_branch_node *branch = gpir_node_to_branch(node); - BITSET_SET(live_nodes, branch->cond->index); } else if (node->op == gpir_op_load_reg) { + /* GEN */ gpir_load_node *load = gpir_node_to_load(node); BITSET_SET(ctx->live, load->reg->index); } @@ -288,39 +238,21 @@ static void calc_interference(struct regalloc_ctx *ctx) static bool can_simplify(struct regalloc_ctx *ctx, unsigned i) { struct reg_info *info = &ctx->registers[i]; - if (i < ctx->comp->cur_reg) { - /* Physical regs. */ - return info->phys_conflicts + info->node_conflicts < GPIR_PHYSICAL_REG_NUM; - } else { - /* Nodes: if we manage to allocate all of its conflicting physical - * registers, they will take up at most GPIR_PHYSICAL_REG_NUM colors, so - * we can ignore any more than that. - */ - return MIN2(info->phys_conflicts, GPIR_PHYSICAL_REG_NUM) + - info->node_conflicts < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM; - } + return info->num_conflicts < GPIR_PHYSICAL_REG_NUM; } static void push_stack(struct regalloc_ctx *ctx, unsigned i) { ctx->stack[ctx->stack_size++] = i; - if (i < ctx->comp->cur_reg) - gpir_debug("pushing reg%u\n", i); - else - gpir_debug("pushing %d\n", i - ctx->comp->cur_reg); + gpir_debug("pushing reg%u\n", i); struct reg_info *info = &ctx->registers[i]; assert(info->visited); util_dynarray_foreach(&info->conflict_list, unsigned, conflict) { struct reg_info *conflict_info = &ctx->registers[*conflict]; - if (i < ctx->comp->cur_reg) { - assert(conflict_info->phys_conflicts > 0); - conflict_info->phys_conflicts--; - } else { - assert(conflict_info->node_conflicts > 0); - conflict_info->node_conflicts--; - } + assert(conflict_info->num_conflicts > 0); + conflict_info->num_conflicts--; if (!ctx->registers[*conflict].visited && can_simplify(ctx, *conflict)) { ctx->worklist[ctx->worklist_end++] = *conflict; ctx->registers[*conflict].visited = true; @@ -335,7 +267,7 @@ static bool do_regalloc(struct regalloc_ctx *ctx) ctx->stack_size = 0; /* Step 1: find the initially simplifiable registers */ - for (int i = 0; i < ctx->comp->cur_reg + ctx->comp->cur_index; i++) { + for (int i = 0; i < ctx->comp->cur_reg; i++) { if (can_simplify(ctx, i)) { ctx->worklist[ctx->worklist_end++] = i; ctx->registers[i].visited = true; @@ -348,7 +280,7 @@ static bool do_regalloc(struct regalloc_ctx *ctx) push_stack(ctx, ctx->worklist[ctx->worklist_start++]); } - if (ctx->stack_size < ctx->num_nodes_and_regs) { + if (ctx->stack_size < ctx->comp->cur_reg) { /* If there are still unsimplifiable nodes left, we need to * optimistically push a node onto the stack. Choose the one with * the smallest number of current neighbors, since that's the most @@ -356,13 +288,13 @@ static bool do_regalloc(struct regalloc_ctx *ctx) */ unsigned min_conflicts = UINT_MAX; unsigned best_reg = 0; - for (unsigned reg = 0; reg < ctx->num_nodes_and_regs; reg++) { + for (int reg = 0; reg < ctx->comp->cur_reg; reg++) { struct reg_info *info = &ctx->registers[reg]; if (info->visited) continue; - if (info->phys_conflicts + info->node_conflicts < min_conflicts) { + if (info->num_conflicts < min_conflicts) { best_reg = reg; - min_conflicts = info->phys_conflicts + info->node_conflicts; + min_conflicts = info->num_conflicts; } } gpir_debug("optimistic triggered\n"); @@ -374,21 +306,14 @@ static bool do_regalloc(struct regalloc_ctx *ctx) } /* Step 4: pop off the stack and assign colors */ - for (int i = ctx->num_nodes_and_regs - 1; i >= 0; i--) { + for (int i = ctx->comp->cur_reg - 1; i >= 0; i--) { unsigned idx = ctx->stack[i]; struct reg_info *reg = &ctx->registers[idx]; - unsigned num_available_regs; - if (idx < ctx->comp->cur_reg) { - num_available_regs = GPIR_PHYSICAL_REG_NUM; - } else { - num_available_regs = GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM; - } - bool found = false; - unsigned start = i % num_available_regs; - for (unsigned j = 0; j < num_available_regs; j++) { - unsigned candidate = (j + start) % num_available_regs; + unsigned start = i % GPIR_PHYSICAL_REG_NUM; + for (unsigned j = 0; j < GPIR_PHYSICAL_REG_NUM; j++) { + unsigned candidate = (j + start) % GPIR_PHYSICAL_REG_NUM; bool available = true; util_dynarray_foreach(®->conflict_list, unsigned, conflict_idx) { struct reg_info *conflict = &ctx->registers[*conflict_idx]; @@ -420,11 +345,6 @@ static void assign_regs(struct regalloc_ctx *ctx) { list_for_each_entry(gpir_block, block, &ctx->comp->block_list, list) { list_for_each_entry(gpir_node, node, &block->node_list, list) { - if (node->index >= 0) { - node->value_reg = - ctx->registers[ctx->comp->cur_reg + node->index].assigned_color; - } - if (node->op == gpir_op_load_reg) { gpir_load_node *load = gpir_node_to_load(node); unsigned color = ctx->registers[load->reg->index].assigned_color; @@ -452,6 +372,195 @@ static void assign_regs(struct regalloc_ctx *ctx) } } +/* Value register allocation */ + +/* Define a special token for when the register is occupied by a preallocated + * physical register (i.e. load_reg/store_reg). Normally entries in the "live" + * array points to the definition of the value, but there may be multiple + * definitions in this case, and they will certainly come from other basic + * blocks, so it doesn't make sense to do that here. + */ +static gpir_node __physreg_live; +#define PHYSREG_LIVE (&__physreg_live) + +struct value_regalloc_ctx { + gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM]; + gpir_node *complex1_last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM]; + gpir_node *live[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM]; + gpir_node *last_complex1; + unsigned alloc_start; +}; + +static unsigned find_free_value_reg(struct value_regalloc_ctx *ctx) +{ + /* Implement round-robin allocation */ + unsigned reg_offset = ctx->alloc_start++; + if (ctx->alloc_start == GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM) + ctx->alloc_start = 0; + + unsigned reg = UINT_MAX; + for (unsigned reg_base = 0; + reg_base < GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM; + reg_base++) { + unsigned cur_reg = (reg_base + reg_offset) % (GPIR_PHYSICAL_REG_NUM + GPIR_VALUE_REG_NUM); + if (!ctx->live[cur_reg]) { + reg = cur_reg; + break; + } + } + + return reg; +} + +static void add_fake_dep(gpir_node *node, gpir_node *src, + struct value_regalloc_ctx *ctx) +{ + assert(src->value_reg >= 0); + if (ctx->last_written[src->value_reg] && + ctx->last_written[src->value_reg] != node) { + gpir_node_add_dep(ctx->last_written[src->value_reg], node, + GPIR_DEP_WRITE_AFTER_READ); + } + + /* For a sequence of schedule_first nodes right before a complex1 + * node, add any extra fake dependencies necessary so that the + * schedule_first nodes can be scheduled right after the complex1 is + * scheduled. We have to save the last_written before complex1 happens to + * avoid adding dependencies to children of the complex1 node which would + * create a cycle. + */ + + if (gpir_op_infos[node->op].schedule_first && + ctx->last_complex1 && + ctx->complex1_last_written[src->value_reg]) { + gpir_node_add_dep(ctx->complex1_last_written[src->value_reg], + ctx->last_complex1, + GPIR_DEP_WRITE_AFTER_READ); + } +} + +static bool handle_value_read(gpir_node *node, gpir_node *src, + struct value_regalloc_ctx *ctx) +{ + /* If already allocated, don't allocate it */ + if (src->value_reg < 0) { + unsigned reg = find_free_value_reg(ctx); + if (reg == UINT_MAX) + return false; + + src->value_reg = reg; + ctx->live[reg] = src; + } + + /* Add any fake dependencies. Note: this is the actual result of value + * register allocation. We throw away node->value_reg afterwards, since + * it's really the fake dependencies which constrain the post-RA scheduler + * enough to make sure it never needs to spill to temporaries. + */ + add_fake_dep(node, src, ctx); + + return true; +} + +static bool handle_reg_read(gpir_load_node *load, + struct value_regalloc_ctx *ctx) +{ + unsigned idx = load->index * 4 + load->component; + if (!ctx->live[idx]) { + ctx->live[idx] = PHYSREG_LIVE; + } else if (ctx->live[idx] != PHYSREG_LIVE) { + /* This slot is occupied by some other value register, so we need to + * evict it. This effectively splits the live range of the value + * register. NB: since we create fake dependencies on the fly, and the + * fake dependencies are the only output of this pass, we don't actually + * have to record where the split happened or that this value was + * assigned to two different registers. Any actual live range splitting + * happens in the post-RA scheduler, which moves the value to and from + * the register file. This will just cause some reads of the value + * register to have different fake dependencies. + */ + unsigned new_reg = find_free_value_reg(ctx); + if (new_reg == UINT_MAX) + return false; + ctx->live[new_reg] = ctx->live[idx]; + ctx->live[new_reg]->value_reg = new_reg; + ctx->live[idx] = PHYSREG_LIVE; + } + + if (ctx->last_written[idx]) { + gpir_node_add_dep(ctx->last_written[idx], &load->node, + GPIR_DEP_WRITE_AFTER_READ); + } + + return true; +} + +static void handle_reg_write(gpir_store_node *store, + struct value_regalloc_ctx *ctx) +{ + unsigned idx = store->index * 4 + store->component; + store->node.value_reg = idx; + ctx->last_written[idx] = &store->node; + ctx->live[idx] = NULL; +} + +static void handle_value_write(gpir_node *node, + struct value_regalloc_ctx *ctx) +{ + ctx->last_written[node->value_reg] = node; + ctx->live[node->value_reg] = NULL; +} + +static bool regalloc_value_regs(gpir_block *block) +{ + struct value_regalloc_ctx ctx = { { 0 } }; + + list_for_each_entry(gpir_node, node, &block->node_list, list) { + node->value_reg = -1; + } + + list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { + if (node->op == gpir_op_complex1) { + ctx.last_complex1 = node; + memcpy(ctx.complex1_last_written, ctx.last_written, + sizeof(ctx.complex1_last_written)); + } + + if (node->type != gpir_node_type_store && + node->type != gpir_node_type_branch) { + handle_value_write(node, &ctx); + } else if (node->op == gpir_op_store_reg) { + handle_reg_write(gpir_node_to_store(node), &ctx); + } + + if (node->type == gpir_node_type_store) { + gpir_store_node *store = gpir_node_to_store(node); + if (!handle_value_read(&store->node, store->child, &ctx)) + return false; + } else if (node->type == gpir_node_type_alu) { + gpir_alu_node *alu = gpir_node_to_alu(node); + for (int i = 0; i < alu->num_child; i++) { + if (!handle_value_read(&alu->node, alu->children[i], &ctx)) + return false; + } + } else if (node->type == gpir_node_type_branch) { + /* At the end of a block the top 11 values are always free, so + * branches should always succeed. + */ + gpir_branch_node *branch = gpir_node_to_branch(node); + ASSERTED bool result = handle_value_read(&branch->node, + branch->cond, &ctx); + assert(result); + } else if (node->op == gpir_op_load_reg) { + gpir_load_node *load = gpir_node_to_load(node); + if (!handle_reg_read(load, &ctx)) + return false; + } + } + + return true; +} + static void regalloc_print_result(gpir_compiler *comp) { if (!(lima_debug & LIMA_DEBUG_GP)) @@ -486,15 +595,14 @@ bool gpir_regalloc_prog(gpir_compiler *comp) struct regalloc_ctx ctx; ctx.mem_ctx = ralloc_context(NULL); - ctx.num_nodes_and_regs = comp->cur_reg + comp->cur_index; - ctx.bitset_words = BITSET_WORDS(ctx.num_nodes_and_regs); + ctx.bitset_words = BITSET_WORDS(comp->cur_reg); ctx.live = ralloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); - ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs); - ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, ctx.num_nodes_and_regs); + ctx.worklist = ralloc_array(ctx.mem_ctx, unsigned, comp->cur_reg); + ctx.stack = ralloc_array(ctx.mem_ctx, unsigned, comp->cur_reg); ctx.comp = comp; - ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, ctx.num_nodes_and_regs); - for (unsigned i = 0; i < ctx.num_nodes_and_regs; i++) { + ctx.registers = rzalloc_array(ctx.mem_ctx, struct reg_info, comp->cur_reg); + for (int i = 0; i < comp->cur_reg; i++) { ctx.registers[i].conflicts = rzalloc_array(ctx.mem_ctx, BITSET_WORD, ctx.bitset_words); util_dynarray_init(&ctx.registers[i].conflict_list, ctx.mem_ctx); @@ -514,6 +622,11 @@ bool gpir_regalloc_prog(gpir_compiler *comp) } assign_regs(&ctx); + list_for_each_entry(gpir_block, block, &comp->block_list, list) { + if (!regalloc_value_regs(block)) + return false; + } + regalloc_print_result(comp); ralloc_free(ctx.mem_ctx); return true; diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c index 90a830f056e..78128bd891e 100644 --- a/src/gallium/drivers/lima/ir/gp/scheduler.c +++ b/src/gallium/drivers/lima/ir/gp/scheduler.c @@ -1612,33 +1612,8 @@ static bool schedule_block(gpir_block *block) return true; } -static void add_fake_dep(gpir_node *node, gpir_node *dep_node, - gpir_node *last_written[]) -{ - gpir_node_foreach_pred(node, dep) { - if (dep->type == GPIR_DEP_INPUT) { - int index = dep->pred->value_reg; - if (index >= 0 && last_written[index]) { - gpir_node_add_dep(last_written[index], dep_node, - GPIR_DEP_WRITE_AFTER_READ); - } - if (gpir_op_infos[dep->pred->op].schedule_first) { - /* Insert fake dependencies for any schedule_first children on - * this node as well. This guarantees that as soon as - * "dep_node" is ready to schedule, all of its schedule_first - * children, grandchildren, etc. are ready so that they can be - * scheduled as soon as possible. - */ - add_fake_dep(dep->pred, dep_node, last_written); - } - } - } -} - static void schedule_build_dependency(gpir_block *block) { - gpir_node *last_written[GPIR_VALUE_REG_NUM + GPIR_PHYSICAL_REG_NUM] = {0}; - /* merge dummy_f/m to the node created from */ list_for_each_entry_safe(gpir_node, node, &block->node_list, list) { if (node->op == gpir_op_dummy_m) { @@ -1658,30 +1633,6 @@ static void schedule_build_dependency(gpir_block *block) gpir_node_delete(node); } } - - memset(last_written, 0, sizeof(last_written)); - - /* False dependencies. For value registers, these exist only to make sure - * that the maximum pressure isn't exceeded and are hence "fake". - */ - list_for_each_entry_rev(gpir_node, node, &block->node_list, list) { - if (node->op == gpir_op_load_reg) { - gpir_load_node *load = gpir_node_to_load(node); - unsigned index = 4 * load->index + load->component; - if (last_written[index]) { - gpir_node_add_dep(last_written[index], node, GPIR_DEP_WRITE_AFTER_READ); - } - } else if (node->op == gpir_op_store_reg) { - gpir_store_node *store = gpir_node_to_store(node); - unsigned index = 4 * store->index + store->component; - last_written[index] = node; - } else { - add_fake_dep(node, node, last_written); - } - - if (node->value_reg >= 0) - last_written[node->value_reg] = node; - } } static void print_statistic(gpir_compiler *comp, int save_index)