panfrost/midgard: Allocate registers once (per-screen)

This should save a lot of per-compile time by using the register allocator (RA) the way it's actually supposed to be used.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
2019-07-23 07:59:00 -07:00
parent 772a5f9814
commit 840b806d64
7 changed files with 86 additions and 19 deletions
@@ -67,7 +67,7 @@ panfrost_shader_compile(struct panfrost_context *ctx, struct mali_shader_meta *m
                .alpha_ref = state->alpha_state.ref_value
        };

-        midgard_compile_shader_nir(s, &program, false);
+        midgard_compile_shader_nir(&ctx->compiler, s, &program, false);

        /* Prepare the compiled binary for upload */
        int size = program.compiled.size;
@@ -170,7 +170,7 @@ panfrost_compile_blend_shader(
        /* Compile the built shader */

        midgard_program program;
-        midgard_compile_shader_nir(shader, &program, true);
+        midgard_compile_shader_nir(&ctx->compiler, shader, &program, true);

        /* Upload the shader */

@@ -91,6 +91,9 @@ struct panfrost_context {
        /* Gallium context */
        struct pipe_context base;

+        /* Compiler context */
+        struct midgard_screen compiler;
+
        /* Bound job and map of panfrost_job_key to jobs */
        struct panfrost_job *job;
        struct hash_table *jobs;
@@ -188,6 +188,9 @@ typedef struct compiler_context {
        nir_shader *nir;
        gl_shader_stage stage;

+        /* The screen we correspond to */
+        struct midgard_screen *screen;
+
        /* Is internally a blend shader? Depends on stage == FRAGMENT */
        bool is_blend;

@@ -2395,7 +2395,7 @@ midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx)
 }

 int
-midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend)
+midgard_compile_shader_nir(struct midgard_screen *screen, nir_shader *nir, midgard_program *program, bool is_blend)
 {
        struct util_dynarray *compiled = &program->compiled;

@@ -2403,6 +2403,7 @@ midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_bl

        compiler_context ictx = {
                .nir = nir,
+                .screen = screen,
                .stage = nir->info.stage,

                .is_blend = is_blend,
@@ -26,6 +26,24 @@

 #include "compiler/nir/nir.h"
 #include "util/u_dynarray.h"
+#include "util/register_allocate.h"
+
+/* To be shoved inside panfrost_screen for the Gallium driver, or somewhere
+ * else for Vulkan/standalone. The single compiler "screen" to be shared across
+ * all shader compiles, used to store complex initialization (for instance,
+ * related to register allocation) */
+
+struct midgard_screen {
+        /* Precomputed register allocation sets for varying numbers of work
+         * registers, indexed by (work register count - 8). The zeroth entry
+         * corresponds to 8 work registers; the eighth entry corresponds to
+         * 16 work registers. An entry is NULL if that set has not been
+         * created yet. */
+
+        struct ra_regs *regs[9];
+
+        /* Work register classes corresponding to the register sets above,
+         * using the same first index. The four entries per set are the
+         * vec1, vec2, vec3 and vec4 classes, in that order. */
+        unsigned reg_classes[9][4];
+};

 /* Define the general compiler entry point */

@@ -92,7 +110,7 @@ typedef struct {
 } midgard_program;

 int
-midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend);
+midgard_compile_shader_nir(struct midgard_screen *screen, nir_shader *nir, midgard_program *program, bool is_blend);

 /* NIR options are shared between the standalone compiler and the online
 * compiler. Defining it here is the simplest, though maybe not the Right
@@ -157,17 +157,12 @@ index_to_reg(compiler_context *ctx, struct ra_graph *g, int reg)
        return r;
 }

-/* This routine performs the actual register allocation. It should be succeeded
- * by install_registers */
+/* This routine creates a register set. Should be called infrequently since
+ * it's slow and can be cached */

-struct ra_graph *
-allocate_registers(compiler_context *ctx, bool *spilled)
+static struct ra_regs *
+create_register_set(unsigned work_count, unsigned *classes)
 {
-        /* The number of vec4 work registers available depends on when the
-         * uniforms start, so compute that first */
-
-        int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0);
-
        int virtual_count = work_count * WORK_STRIDE;

        /* First, initialize the RA */
@@ -178,12 +173,10 @@ allocate_registers(compiler_context *ctx, bool *spilled)
        int work_vec2 = ra_alloc_reg_class(regs);
        int work_vec1 = ra_alloc_reg_class(regs);

-        unsigned classes[4] = {
-                work_vec1,
-                work_vec2,
-                work_vec3,
-                work_vec4
-        };
+        classes[0] = work_vec1;
+        classes[1] = work_vec2;
+        classes[2] = work_vec3;
+        classes[3] = work_vec4;

        /* Add the full set of work registers */
        for (unsigned i = 0; i < work_count; ++i) {
@@ -217,6 +210,55 @@ allocate_registers(compiler_context *ctx, bool *spilled)
        /* We're done setting up */
        ra_set_finalize(regs, NULL);

+        return regs;
+}
+
+/* This routine gets a precomputed register set off the screen if it's able,
+ * or otherwise it computes one on the fly and caches it on the screen.
+ *
+ * screen: holds the per-work-count cache (regs / reg_classes), which this
+ *         routine fills in on a miss.
+ * work_count: number of work registers; must be within [8, 16] (asserted).
+ * classes: out-parameter, set to point at the screen's class array for this
+ *          work count (storage inside the screen, not a copy).
+ *
+ * Returns the cached or newly created register set. */
+
+static struct ra_regs *
+get_register_set(struct midgard_screen *screen, unsigned work_count, unsigned **classes)
+{
+        /* Bounds check: the cache arrays only have slots for 8..16 work
+         * registers */
+        assert(work_count >= 8);
+        assert(work_count <= 16);
+
+        /* Compute index: slot 0 is 8 work registers, slot 8 is 16 */
+        unsigned index = work_count - 8;
+
+        /* Find the reg set (NULL means this slot has not been filled) */
+        struct ra_regs *cached = screen->regs[index];
+
+        if (cached) { /* Cache hit: hand back the stored set and its classes */
+                assert(screen->reg_classes[index]); /* NOTE(review): reg_classes[index] is an
+                                                     * array member, so it decays to a non-NULL
+                                                     * pointer and this assert cannot fail */
+                *classes = screen->reg_classes[index];
+                return cached;
+        }
+
+        /* Otherwise, create one. The class IDs are written straight into the
+         * screen's reg_classes slot for this index. */
+        struct ra_regs *created = create_register_set(work_count, screen->reg_classes[index]);
+
+        /* Cache it and use it */
+        screen->regs[index] = created;
+
+        *classes = screen->reg_classes[index];
+        return created;
+}
+
+/* This routine performs the actual register allocation. It should be succeeded
+ * by install_registers */
+
+struct ra_graph *
+allocate_registers(compiler_context *ctx, bool *spilled)
+{
+        /* The number of vec4 work registers available depends on when the
+         * uniforms start, so compute that first */
+        int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0);
+        unsigned *classes = NULL;
+        struct ra_regs *regs = get_register_set(ctx->screen, work_count, &classes);
+
+        assert(regs != NULL);
+        assert(classes != NULL);
+
       /* No register allocation to do with no SSA */

        if (!ctx->temp_count)