From 54f6e72b27154219be237690453bb9c3b57cd3ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Tue, 25 Oct 2022 21:41:48 +0200 Subject: [PATCH] r300: better register allocator for vertex shaders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a port of the fp pair regalloc. It is, however, much simpler: unlike the fp, we don't have to care about texturing, we can use any swizzle we want, and we don't have to track the inputs. Using the register class machinery might actually be slight overkill right now; however, the infrastructure will hopefully come in handy if someone decides to implement the vp pair scheduling eventually. Shader-db stats: RV530: total temps in shared programs: 18594 -> 17000 (-8.57%) temps in affected programs: 5753 -> 4159 (-27.71%) helped: 665 HURT: 0 RV370: total temps in shared programs: 13555 -> 12181 (-10.14%) temps in affected programs: 5116 -> 3742 (-26.86%) helped: 633 HURT: 0 Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5972 Signed-off-by: Pavel Ondračka Reviewed-by: Filip Gawin Tested-by: Filip Gawin Part-of: --- .../drivers/r300/compiler/r3xx_vertprog.c | 112 ++++++++---------- src/gallium/drivers/r300/r300_context.c | 1 + src/gallium/drivers/r300/r300_context.h | 2 + src/gallium/drivers/r300/r300_vs.c | 2 +- 4 files changed, 52 insertions(+), 65 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c index 3814698047d..a923b80bb0a 100644 --- a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c +++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c @@ -34,6 +34,8 @@ #include "radeon_swizzle.h" #include "radeon_emulate_branches.h" #include "radeon_remove_constants.h" +#include "radeon_regalloc.h" +#include "radeon_list.h" #include "util/compiler.h" @@ -610,82 +612,64 @@ static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, b static void
allocate_temporary_registers(struct radeon_compiler *c, void *user) { - struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c; - struct rc_instruction *inst; - struct rc_instruction *end_loop = NULL; - unsigned int num_orig_temps = 0; - bool hwtemps[RC_REGISTER_MAX_INDEX]; - struct temporary_allocation * ta; - unsigned int i; - - memset(hwtemps, 0, sizeof(hwtemps)); + unsigned int node_count, node_index; + struct ra_class ** node_classes; + struct rc_list * var_ptr; + struct rc_list * variables; + struct ra_graph * graph; + const struct rc_regalloc_state *ra_state = c->regalloc_state; rc_recompute_ips(c); - /* Pass 1: Count original temporaries. */ - for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { - const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); + /* Get list of program variables */ + variables = rc_get_variables(c); + node_count = rc_list_count(variables); + node_classes = memory_pool_malloc(&c->Pool, + node_count * sizeof(struct ra_class *)); - for (i = 0; i < opcode->NumSrcRegs; ++i) { - if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { - if (inst->U.I.SrcReg[i].Index >= num_orig_temps) - num_orig_temps = inst->U.I.SrcReg[i].Index + 1; - } - } - - if (opcode->HasDstReg) { - if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { - if (inst->U.I.DstReg.Index >= num_orig_temps) - num_orig_temps = inst->U.I.DstReg.Index + 1; - } + for (var_ptr = variables, node_index = 0; var_ptr; + var_ptr = var_ptr->Next, node_index++) { + unsigned int class_index = 0; + int index; + /* Compute the live intervals */ + rc_variable_compute_live_intervals(var_ptr->Item); + unsigned int writemask = rc_variable_writemask_sum(var_ptr->Item); + index = rc_find_class(c->regalloc_state->class_list, writemask, 6); + if (index > -1) { + class_index = c->regalloc_state->class_list[index].ID; + } else { + rc_error(c, + "Could not find class for index=%u 
mask=%u\n", + ((struct rc_variable *)var_ptr->Item)->Dst.Index, writemask); } + node_classes[node_index] = ra_state->classes[class_index]; } - ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool, - sizeof(struct temporary_allocation) * num_orig_temps); - memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps); + graph = ra_alloc_interference_graph(ra_state->regs, node_count); - /* Pass 2: Determine original temporary lifetimes */ - for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { - const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); - /* Instructions inside of loops need to use the ENDLOOP - * instruction as their LastRead. */ - if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) - end_loop = rc_match_bgnloop(inst); - - if (inst == end_loop) { - end_loop = NULL; - continue; - } - - for (i = 0; i < opcode->NumSrcRegs; ++i) { - if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { - ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? 
end_loop : inst; - } - } + for (node_index = 0; node_index < node_count; node_index++) { + ra_set_node_class(graph, node_index, node_classes[node_index]); } - /* Pass 3: Register allocation */ - for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) { - const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode); + rc_build_interference_graph(graph, variables); - for (i = 0; i < opcode->NumSrcRegs; ++i) { - if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) { - unsigned int orig = inst->U.I.SrcReg[i].Index; - inst->U.I.SrcReg[i].Index = get_reg(c, ta, hwtemps, orig); - - if (ta[orig].Allocated && inst == ta[orig].LastRead) - hwtemps[ta[orig].HwTemp] = false; - } - } - - if (opcode->HasDstReg) { - if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) { - unsigned int orig = inst->U.I.DstReg.Index; - inst->U.I.DstReg.Index = get_reg(c, ta, hwtemps, orig); - } - } + if (!ra_allocate(graph)) { + rc_error(c, "Ran out of hardware temporaries\n"); + return; } + + /* Rewrite the registers */ + for (var_ptr = variables, node_index = 0; var_ptr; + var_ptr = var_ptr->Next, node_index++) { + int reg = ra_get_node_reg(graph, node_index); + unsigned int writemask = reg_get_writemask(reg); + unsigned int index = reg_get_index(reg); + struct rc_variable * var = var_ptr->Item; + + rc_variable_change_dst(var, index, writemask); + } + + ralloc_free(graph); } /** diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c index 0aa61de1b5a..9ca39d5dbbf 100644 --- a/src/gallium/drivers/r300/r300_context.c +++ b/src/gallium/drivers/r300/r300_context.c @@ -491,6 +491,7 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, /* Register allocator state */ rc_init_regalloc_state(&r300->fs_regalloc_state, RC_FRAGMENT_PROGRAM); + rc_init_regalloc_state(&r300->vs_regalloc_state, RC_VERTEX_PROGRAM); /* Print driver info. 
*/ #ifdef DEBUG diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h index a8228801dff..4745dd68e0d 100644 --- a/src/gallium/drivers/r300/r300_context.h +++ b/src/gallium/drivers/r300/r300_context.h @@ -637,6 +637,8 @@ struct r300_context { /* Compiler state. */ struct rc_regalloc_state fs_regalloc_state; /* Register allocator info for * fragment shaders. */ + struct rc_regalloc_state vs_regalloc_state; /* Register allocator info for + * vertex shaders. */ }; #define foreach_atom(r300, atom) \ diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c index 2ffaf629d23..76413b0a040 100644 --- a/src/gallium/drivers/r300/r300_vs.c +++ b/src/gallium/drivers/r300/r300_vs.c @@ -190,7 +190,7 @@ void r300_translate_vertex_shader(struct r300_context *r300, /* Setup the compiler */ memset(&compiler, 0, sizeof(compiler)); - rc_init(&compiler.Base, NULL); + rc_init(&compiler.Base, &r300->vs_regalloc_state); DBG_ON(r300, DBG_VP) ? compiler.Base.Debug |= RC_DBG_LOG : 0; compiler.code = &vs->code;