From e3b746e3a31e350e9f0962717e49acba28efee30 Mon Sep 17 00:00:00 2001
From: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Date: Tue, 7 Feb 2023 13:32:21 -0500
Subject: [PATCH] zink: use GPL to handle (simple) separate shader objects

apps/games using separate shader objects end up passing the separable
shaders to the link_shader hook individually, which is still not ideal for
zink's usage since the more optimal path is to have all the shaders and create
a RAST+FS GPL stage that can run all the inter-stage io handlers

it IS technically possible to handle this for simple VS+FS pipelines using
GPL, but it's kinda gross. such shaders now use descriptor buffer
to create their own pipelines/layouts/descriptors async, and then a "separable"
variant of the gfx program can be created by fast-linking these together

the "separable" gfx program can't handle shader variants, but it can do basic
pipeline caching for PSO state changes, which makes it flexible enough to sorta
kinda maybe handle the most basic cases of separate shader objects

descriptor buffer is used because having to create and manage a separate architecture
for sets/pools/templates is too nightmarish even for me

this is, at best, a partial solution, but it's the best the vulkan api can
currently do

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21197>
---
 src/gallium/drivers/zink/zink_compiler.c      |  44 ++++
 src/gallium/drivers/zink/zink_compiler.h      |   2 +
 src/gallium/drivers/zink/zink_descriptors.c   | 166 +++++++++++++++
 src/gallium/drivers/zink/zink_descriptors.h   |   8 +-
 src/gallium/drivers/zink/zink_pipeline.c      |  11 +-
 src/gallium/drivers/zink/zink_pipeline.h      |   2 +
 src/gallium/drivers/zink/zink_program.c       | 190 +++++++++++++++++-
 .../drivers/zink/zink_program_state.hpp       |   8 +-
 src/gallium/drivers/zink/zink_screen.c        |   2 +
 src/gallium/drivers/zink/zink_types.h         |  39 +++-
 10 files changed, 447 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c
index c8d22f1f076..2352fe17aa4 100644
--- a/src/gallium/drivers/zink/zink_compiler.c
+++ b/src/gallium/drivers/zink/zink_compiler.c
@@ -24,6 +24,7 @@
 #include "nir_opcodes.h"
 #include "zink_context.h"
 #include "zink_compiler.h"
+#include "zink_descriptors.h"
 #include "zink_program.h"
 #include "zink_screen.h"
 #include "nir_to_spirv/nir_to_spirv.h"
@@ -3205,6 +3206,39 @@ zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs,
    return mod;
 }
 
+VkShaderModule
+zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, nir_shader **ret_nir)
+{ /* Compile zs on its own: remap descriptor sets/bindings on a clone, return the module, and hand the clone back through ret_nir. */
+   nir_shader *nir = nir_shader_clone(NULL, zs->nir); /* all edits below go to the clone, not to zs->nir */
+   int set = nir->info.stage == MESA_SHADER_FRAGMENT; /* descriptor set index: 1 for fragment shaders, 0 otherwise */
+   unsigned offsets[4]; /* per-type binding bases; indices 1..3 are presumably SAMPLER_VIEW/SSBO/IMAGE -- confirm against the enum */
+   zink_descriptor_shader_get_binding_offsets(zs, offsets);
+   nir_foreach_variable_with_modes(var, nir, nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_uniform | nir_var_image) {
+      if (var->data.bindless) /* bindless variables keep their existing set/binding */
+         continue;
+      var->data.descriptor_set = set;
+      switch (var->data.mode) {
+      case nir_var_mem_ubo:
+            var->data.binding = !!var->data.driver_location; /* driver_location 0 -> binding 0, any other UBO -> binding 1 */
+            break;
+      case nir_var_uniform:
+         if (glsl_type_is_sampler(glsl_without_array(var->type))) /* only sampler-typed uniforms are rebased */
+            var->data.binding += offsets[1];
+         break;
+      case nir_var_mem_ssbo:
+         var->data.binding += offsets[2];
+         break;
+      case nir_var_image:
+         var->data.binding += offsets[3];
+         break;
+      default: break;
+      }
+   }
+   optimize_nir(nir, zs);
+   *ret_nir = nir; /* the clone is not freed here; it is passed out to the caller */
+   return compile_module(screen, zs, nir);
+}
+
 static bool
 lower_baseinstance_instr(nir_builder *b, nir_instr *instr, void *data)
 {
@@ -4196,6 +4230,7 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
 
    ret->sinfo.have_vulkan_memory_model = screen->info.have_KHR_vulkan_memory_model;
 
+   util_queue_fence_init(&ret->precompile.fence);
    ret->hash = _mesa_hash_pointer(ret);
 
    ret->programs = _mesa_pointer_set_create(NULL);
@@ -4490,8 +4525,16 @@ zink_shader_free(struct zink_screen *screen, struct zink_shader *shader)
       shader->non_fs.generated_gs = NULL;
    }
    _mesa_set_destroy(shader->programs, NULL);
+   util_queue_fence_wait(&shader->precompile.fence);
+   util_queue_fence_destroy(&shader->precompile.fence);
+   zink_descriptor_shader_deinit(screen, shader);
+   if (shader->precompile.mod)
+      VKSCR(DestroyShaderModule)(screen->dev, shader->precompile.mod, NULL);
+   if (shader->precompile.gpl)
+      VKSCR(DestroyPipeline)(screen->dev, shader->precompile.gpl, NULL);
    ralloc_free(shader->nir);
    ralloc_free(shader->spirv);
+   free(shader->precompile.bindings);
    ralloc_free(shader);
 }
 
@@ -4530,6 +4573,7 @@ struct zink_shader *
 zink_shader_tcs_create(struct zink_screen *screen, struct zink_shader *vs, unsigned vertices_per_patch)
 {
    struct zink_shader *ret = rzalloc(NULL, struct zink_shader);
+   util_queue_fence_init(&ret->precompile.fence);
    ret->hash = _mesa_hash_pointer(ret);
    ret->programs = _mesa_pointer_set_create(NULL);
    simple_mtx_init(&ret->lock, mtx_plain);
diff --git a/src/gallium/drivers/zink/zink_compiler.h b/src/gallium/drivers/zink/zink_compiler.h
index a1c894d3853..30a3111e68f 100644
--- a/src/gallium/drivers/zink/zink_compiler.h
+++ b/src/gallium/drivers/zink/zink_compiler.h
@@ -63,6 +63,8 @@ zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_sh
 VkShaderModule
 zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shader *nir, const struct zink_shader_key *key, const void *extra_data);
 VkShaderModule
+zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, nir_shader **ret_nir);
+VkShaderModule
 zink_shader_spirv_compile(struct zink_screen *screen, struct zink_shader *zs, struct spirv_shader *spirv);
 struct zink_shader *
 zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
diff --git a/src/gallium/drivers/zink/zink_descriptors.c b/src/gallium/drivers/zink/zink_descriptors.c
index cc3519d65e1..68f0ef07fe0 100644
--- a/src/gallium/drivers/zink/zink_descriptors.c
+++ b/src/gallium/drivers/zink/zink_descriptors.c
@@ -670,6 +670,96 @@ zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg)
    return true;
 }
 
+void
+zink_descriptor_shader_get_binding_offsets(const struct zink_shader *shader, unsigned *offsets)
+{ /* Fill offsets[] (indexed by ZINK_DESCRIPTOR_TYPE_UBO..IMAGE) with the base binding each type uses in a separately compiled shader. */
+   offsets[ZINK_DESCRIPTOR_TYPE_UBO] = 0;
+   offsets[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] = (shader->num_bindings[ZINK_DESCRIPTOR_TYPE_UBO] ? shader->bindings[ZINK_DESCRIPTOR_TYPE_UBO][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_UBO] - 1].binding + 1 : 1); /* a type with no bindings must not index [-1]; it advances the base by 1, keeping binding 0 reserved */
+   offsets[ZINK_DESCRIPTOR_TYPE_SSBO] = offsets[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] + (shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] ? shader->bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] - 1].binding + 1 : 1); /* each base starts past the last binding of the previous type */
+   offsets[ZINK_DESCRIPTOR_TYPE_IMAGE] = offsets[ZINK_DESCRIPTOR_TYPE_SSBO] + (shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SSBO] ? shader->bindings[ZINK_DESCRIPTOR_TYPE_SSBO][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SSBO] - 1].binding + 1 : 1);
+}
+
+void
+zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shader)
+{ /* Build the per-shader descriptor set layout, descriptor-buffer template/offset tables and pipeline layout kept in shader->precompile. */
+   VkDescriptorSetLayoutBinding bindings[ZINK_DESCRIPTOR_BASE_TYPES * ZINK_MAX_DESCRIPTORS_PER_TYPE]; /* NOTE(review): no spare slot for the has_uniforms entry below -- confirm the maximum can never be fully used */
+   unsigned num_bindings = 0;
+   VkShaderStageFlagBits stage_flags = mesa_to_vk_shader_stage(shader->nir->info.stage);
+
+   unsigned desc_set_size = shader->has_uniforms; /* one template entry for the uniform UBO, if present... */
+   for (unsigned i = 0; i < ZINK_DESCRIPTOR_BASE_TYPES; i++)
+      desc_set_size += shader->num_bindings[i]; /* ...plus one per shader binding */
+   if (desc_set_size)
+      shader->precompile.db_template = rzalloc_array(shader, struct zink_descriptor_template, desc_set_size);
+
+   if (shader->has_uniforms) { /* the uniform UBO always takes binding 0 */
+      VkDescriptorSetLayoutBinding *binding = &bindings[num_bindings];
+      binding->binding = 0;
+      binding->descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+      binding->descriptorCount = 1;
+      binding->stageFlags = stage_flags;
+      binding->pImmutableSamplers = NULL;
+      struct zink_descriptor_template *entry = &shader->precompile.db_template[num_bindings];
+      entry->count = 1;
+      entry->offset = offsetof(struct zink_context, di.db.ubos[shader->nir->info.stage][0]); /* byte offset of this stage's first UBO slot inside zink_context */
+      entry->stride = sizeof(VkDescriptorAddressInfoEXT);
+      entry->db_size = screen->info.db_props.robustUniformBufferDescriptorSize;
+      num_bindings++;
+   }
+   /* sync with zink_shader_compile_separate() */
+   unsigned offsets[4];
+   zink_descriptor_shader_get_binding_offsets(shader, offsets);
+   for (int j = 0; j < ZINK_DESCRIPTOR_BASE_TYPES; j++) {
+      for (int k = 0; k < shader->num_bindings[j]; k++) {
+         VkDescriptorSetLayoutBinding *binding = &bindings[num_bindings];
+         if (j == ZINK_DESCRIPTOR_TYPE_UBO)
+            binding->binding = 1; /* every UBO-type entry gets binding 1, as in the compile path */
+         else
+            binding->binding = shader->bindings[j][k].binding + offsets[j]; /* other types are rebased by their per-type offset */
+         binding->descriptorType = shader->bindings[j][k].type;
+         binding->descriptorCount = shader->bindings[j][k].size;
+         binding->stageFlags = stage_flags;
+         binding->pImmutableSamplers = NULL;
+
+         unsigned temp = 0; /* out-parameter whose result is not used by this function */
+         init_db_template_entry(screen, shader, j, k, &shader->precompile.db_template[num_bindings], &temp);
+         num_bindings++;
+      }
+   }
+   if (num_bindings) {
+      shader->precompile.dsl = descriptor_layout_create(screen, 0, bindings, num_bindings);
+      shader->precompile.bindings = mem_dup(bindings, num_bindings * sizeof(VkDescriptorSetLayoutBinding)); /* heap copy; released with free() in zink_shader_free() */
+      shader->precompile.num_bindings = num_bindings;
+      VkDeviceSize val;
+      VKSCR(GetDescriptorSetLayoutSizeEXT)(screen->dev, shader->precompile.dsl, &val);
+      shader->precompile.db_size = val; /* total size of this set's layout as reported by the driver */
+      shader->precompile.db_offset = rzalloc_array(shader, uint32_t, num_bindings);
+      for (unsigned i = 0; i < num_bindings; i++) {
+         VKSCR(GetDescriptorSetLayoutBindingOffsetEXT)(screen->dev, shader->precompile.dsl, bindings[i].binding, &val);
+         shader->precompile.db_offset[i] = val; /* indexed by position in bindings[], not by binding number */
+      }
+   }
+   VkDescriptorSetLayout dsl[ZINK_DESCRIPTOR_ALL_TYPES] = {0};
+   unsigned num_dsl = num_bindings ? 2 : 0; /* two slots so a fragment layout can sit at index 1 */
+   if (shader->bindless)
+      num_dsl = screen->compact_descriptors ? ZINK_DESCRIPTOR_ALL_TYPES - ZINK_DESCRIPTOR_COMPACT : ZINK_DESCRIPTOR_ALL_TYPES;
+   if (num_bindings || shader->bindless) {
+      dsl[shader->nir->info.stage == MESA_SHADER_FRAGMENT] = shader->precompile.dsl; /* slot 1 for fragment shaders, slot 0 otherwise */
+      if (shader->bindless)
+         dsl[screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS]] = screen->bindless_layout;
+   }
+   shader->precompile.layout = zink_pipeline_layout_create(screen, dsl, num_dsl, false, VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT); /* created even when num_dsl is 0 */
+}
+
+void
+zink_descriptor_shader_deinit(struct zink_screen *screen, struct zink_shader *shader)
+{ /* Destroy the Vulkan layout objects stored in shader->precompile; handles that were never created are skipped. */
+   if (shader->precompile.dsl) /* only set when the shader had at least one binding */
+      VKSCR(DestroyDescriptorSetLayout)(screen->dev, shader->precompile.dsl, NULL);
+   if (shader->precompile.layout)
+      VKSCR(DestroyPipelineLayout)(screen->dev, shader->precompile.layout, NULL);
+}
+
 /* called during program destroy */
 void
 zink_descriptor_program_deinit(struct zink_screen *screen, struct zink_program *pg)
@@ -946,6 +1036,71 @@ populate_sets(struct zink_context *ctx, struct zink_batch_state *bs,
    return true;
 }
 
+static void
+update_separable(struct zink_context *ctx, struct zink_program *pg)
+{ /* Write the descriptors for every used set of a separable gfx program into a descriptor buffer and bind the resulting offsets. */
+   struct zink_screen *screen = zink_screen(ctx->base.screen);
+   struct zink_batch_state *bs = ctx->batch.state;
+
+   unsigned use_buffer = 0;
+   /* find the least-written buffer to use for this */
+   for (unsigned i = 0; i < ARRAY_SIZE(bs->dd.db_offset); i++) {
+      if (bs->dd.db_offset[i] < bs->dd.db_offset[use_buffer])
+         use_buffer = i;
+   }
+   VkDescriptorGetInfoEXT info;
+   info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT;
+   info.pNext = NULL;
+   struct zink_gfx_program *prog = (struct zink_gfx_program *)pg;
+   struct zink_shader *shaders[] = { /* indexed by set: 0 = vertex, 1 = fragment; slot 0 falls back to the fragment shader so it is never empty */
+      prog->shaders[MESA_SHADER_VERTEX]->precompile.num_bindings ? prog->shaders[MESA_SHADER_VERTEX] : prog->shaders[MESA_SHADER_FRAGMENT],
+      prog->shaders[MESA_SHADER_FRAGMENT],
+   };
+   for (unsigned j = 0; j < pg->num_dsl; j++) {
+      if (!(pg->dd.binding_usage & BITFIELD_BIT(j))) /* skips unused sets, so shaders[j] is only read for sets that are marked used */
+         continue;
+      uint64_t offset = bs->dd.db_offset[use_buffer]; /* this set's data starts at the buffer's current write position */
+      assert(bs->dd.db[use_buffer]->obj->size > bs->dd.db_offset[use_buffer] + pg->dd.db_size[j]);
+      for (unsigned i = 0; i < shaders[j]->precompile.num_bindings; i++) {
+         info.type = shaders[j]->precompile.bindings[i].descriptorType;
+         uint64_t desc_offset = offset + pg->dd.db_offset[j][i];
+         if (screen->info.db_props.combinedImageSamplerDescriptorSingleArray ||
+               shaders[j]->precompile.bindings[i].descriptorType != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+               shaders[j]->precompile.bindings[i].descriptorCount == 1) {
+            for (unsigned k = 0; k < shaders[j]->precompile.bindings[i].descriptorCount; k++) {
+               /* VkDescriptorDataEXT is a union of pointers; the member doesn't matter */
+               info.data.pSampler = (void*)(((uint8_t*)ctx) + pg->dd.db_template[j][i].offset + k * pg->dd.db_template[j][i].stride);
+               VKSCR(GetDescriptorEXT)(screen->dev, &info, pg->dd.db_template[j][i].db_size, bs->dd.db_map[use_buffer] + desc_offset + k * pg->dd.db_template[j][i].db_size);
+            }
+         } else {
+            assert(shaders[j]->precompile.bindings[i].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
+            char buf[1024]; /* staging area for one descriptor before it is split below */
+            uint8_t *db = bs->dd.db_map[use_buffer] + desc_offset;
+            uint8_t *samplers = db + shaders[j]->precompile.bindings[i].descriptorCount * screen->info.db_props.sampledImageDescriptorSize;
+            for (unsigned k = 0; k < shaders[j]->precompile.bindings[i].descriptorCount; k++) {
+               /* separable templates are indexed [set][binding], exactly like the branch above; VkDescriptorDataEXT is a union of pointers */
+               info.data.pSampler = (void*)(((uint8_t*)ctx) + pg->dd.db_template[j][i].offset +
+                                             k * pg->dd.db_template[j][i].stride);
+               VKSCR(GetDescriptorEXT)(screen->dev, &info, pg->dd.db_template[j][i].db_size, buf);
+               /* drivers that don't support combinedImageSamplerDescriptorSingleArray must have sampler arrays written in memory as
+                  *
+                  *   | array_of_samplers[] | array_of_sampled_images[] |
+                  *
+                  * which means each descriptor's data must be split
+                  */
+               memcpy(db, buf, screen->info.db_props.samplerDescriptorSize);
+               memcpy(samplers, &buf[screen->info.db_props.samplerDescriptorSize], screen->info.db_props.sampledImageDescriptorSize);
+               db += screen->info.db_props.sampledImageDescriptorSize;
+               samplers += screen->info.db_props.samplerDescriptorSize;
+            }
+         }
+      }
+      bs->dd.cur_db_offset[use_buffer] = bs->dd.db_offset[use_buffer]; /* remember where this set's data begins... */
+      bs->dd.db_offset[use_buffer] += pg->dd.db_size[j]; /* ...then advance the write position past it */
+      VKCTX(CmdSetDescriptorBufferOffsetsEXT)(bs->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pg->layout, j, 1, &use_buffer, &offset);
+   }
+}
+
 /* updates the mask of changed_sets and binds the mask of bind_sets */
 static void
 zink_descriptors_update_masked_buffer(struct zink_context *ctx, bool is_compute, uint8_t changed_sets, uint8_t bind_sets)
@@ -1092,6 +1247,17 @@ zink_descriptors_update(struct zink_context *ctx, bool is_compute)
       ctx->dd.push_state_changed[is_compute] = !!pg->dd.push_usage || ctx->dd.has_fbfetch != bs->dd.has_fbfetch;
    }
 
+   if (!is_compute) {
+      struct zink_gfx_program *prog = (struct zink_gfx_program*)pg;
+      if (prog->is_separable) {
+         /* force all descriptors update on next pass: separables use different layouts */
+         ctx->dd.state_changed[is_compute] = BITFIELD_MASK(ZINK_DESCRIPTOR_TYPE_UNIFORMS);
+         ctx->dd.push_state_changed[is_compute] = true;
+         update_separable(ctx, pg);
+         return;
+      }
+   }
+
    if (pg != bs->dd.pg[is_compute]) {
       /* if we don't already know that we have to update all sets,
        * check to see if any dsls changed
diff --git a/src/gallium/drivers/zink/zink_descriptors.h b/src/gallium/drivers/zink/zink_descriptors.h
index c3705b693a7..e61d24c75d8 100644
--- a/src/gallium/drivers/zink/zink_descriptors.h
+++ b/src/gallium/drivers/zink/zink_descriptors.h
@@ -154,8 +154,12 @@ zink_descriptors_deinit_bindless(struct zink_context *ctx);
 void
 zink_descriptors_update_bindless(struct zink_context *ctx);
 
-
-
+void
+zink_descriptor_shader_get_binding_offsets(const struct zink_shader *shader, unsigned *offsets);
+void
+zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shader);
+void
+zink_descriptor_shader_deinit(struct zink_screen *screen, struct zink_shader *shader);
 
 bool
 zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg);
diff --git a/src/gallium/drivers/zink/zink_pipeline.c b/src/gallium/drivers/zink/zink_pipeline.c
index bea5c155fc4..6d7f716e28b 100644
--- a/src/gallium/drivers/zink/zink_pipeline.c
+++ b/src/gallium/drivers/zink/zink_pipeline.c
@@ -751,7 +751,10 @@ create_gfx_pipeline_library(struct zink_screen *screen, VkShaderModule *modules,
 
    pci.pStages = shader_stages;
    pci.stageCount = num_stages;
-   /* only add LTO for full pipeline libs */
+   /* Only keep LTO information for full pipeline libs.  For separable shaders, they will only
+   * ever be used with fast linking, and to optimize them a new pipeline lib will be created with full
+   * link time information for the full set of shader stages (rather than linking in these single-stage libs).
+   */
    if (num_stages > 1)
       pci.flags |= VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
 
@@ -770,6 +773,12 @@ zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_pro
    return create_gfx_pipeline_library(screen, prog->modules, prog->base.layout, prog->base.pipeline_cache);
 }
 
+VkPipeline
+zink_create_gfx_pipeline_separate(struct zink_screen *screen, VkShaderModule *modules, VkPipelineLayout layout)
+{ /* Create a pipeline library from the given modules and layout, passing VK_NULL_HANDLE as the pipeline cache. */
+   return create_gfx_pipeline_library(screen, modules, layout, VK_NULL_HANDLE);
+}
+
 VkPipeline
 zink_create_gfx_pipeline_combined(struct zink_screen *screen, struct zink_gfx_program *prog, VkPipeline input, VkPipeline *library, unsigned libcount, VkPipeline output, bool optimized)
 {
diff --git a/src/gallium/drivers/zink/zink_pipeline.h b/src/gallium/drivers/zink/zink_pipeline.h
index 11f86ac9fae..c6d5001c074 100644
--- a/src/gallium/drivers/zink/zink_pipeline.h
+++ b/src/gallium/drivers/zink/zink_pipeline.h
@@ -62,6 +62,8 @@ VkPipeline
 zink_create_gfx_pipeline_output(struct zink_screen *screen, struct zink_gfx_pipeline_state *state);
 VkPipeline
 zink_create_gfx_pipeline_combined(struct zink_screen *screen, struct zink_gfx_program *prog, VkPipeline input, VkPipeline *library, unsigned libcount, VkPipeline output, bool optimized);
+VkPipeline
+zink_create_gfx_pipeline_separate(struct zink_screen *screen, VkShaderModule *modules, VkPipelineLayout layout);
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/drivers/zink/zink_program.c b/src/gallium/drivers/zink/zink_program.c
index 02ec650145b..9a255255ec5 100644
--- a/src/gallium/drivers/zink/zink_program.c
+++ b/src/gallium/drivers/zink/zink_program.c
@@ -44,6 +44,11 @@
 #define XXH_INLINE_ALL
 #include "util/xxhash.h"
 
+static void
+precompile_job(void *data, void *gdata, int thread_index);
+struct zink_gfx_program *
+create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch);
+
 void
 debug_describe_zink_gfx_program(char *buf, const struct zink_gfx_program *ptr)
 {
@@ -645,6 +650,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr
 {
    const union zink_shader_key_optimal *optimal_key = (union zink_shader_key_optimal*)&prog->last_variant_hash;
    if (ctx->gfx_pipeline_state.shader_keys_optimal.key.vs_bits != optimal_key->vs_bits) {
+      assert(!prog->is_separable);
       bool changed = update_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->nir->info.stage);
       ctx->gfx_pipeline_state.modules_changed |= changed;
    }
@@ -652,6 +658,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr
    if (ctx->gfx_pipeline_state.shader_keys_optimal.key.fs_bits != optimal_key->fs_bits ||
        /* always recheck shadow swizzles since they aren't directly part of the key */
        unlikely(shadow_needs_shader_swizzle)) {
+      assert(!prog->is_separable);
       bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT);
       ctx->gfx_pipeline_state.modules_changed |= changed;
       if (unlikely(shadow_needs_shader_swizzle)) {
@@ -661,6 +668,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr
    }
    if (prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated &&
        ctx->gfx_pipeline_state.shader_keys_optimal.key.tcs_bits != optimal_key->tcs_bits) {
+      assert(!prog->is_separable);
       bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL);
       ctx->gfx_pipeline_state.modules_changed |= changed;
    }
@@ -682,13 +690,28 @@ zink_gfx_program_update_optimal(struct zink_context *ctx)
          ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash;
       if (entry) {
          prog = (struct zink_gfx_program*)entry->data;
+         if (prog->is_separable) {
+            /* shader variants can't be handled by separable programs: sync and compile */
+            if (!ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key))
+               util_queue_fence_wait(&prog->base.cache_fence);
+            /* If the optimized linked pipeline is done compiling, swap it into place. */
+            if (util_queue_fence_is_signalled(&prog->base.cache_fence)) {
+               struct zink_gfx_program *real = prog->full_prog;
+               entry->data = real;
+               prog->full_prog = NULL;
+               prog->base.removed = true;
+               zink_gfx_program_reference(zink_screen(ctx->base.screen), &prog, NULL);
+               prog = real;
+            }
+         }
          update_gfx_program_optimal(ctx, prog);
       } else {
          ctx->dirty_gfx_stages |= ctx->shader_stages;
-         prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch);
+         prog = create_gfx_program_separable(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch);
          zink_screen_get_pipeline_cache(zink_screen(ctx->base.screen), &prog->base, false);
          _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog);
-         generate_gfx_program_modules_optimal(ctx, zink_screen(ctx->base.screen), prog, &ctx->gfx_pipeline_state);
+         if (!prog->is_separable)
+            generate_gfx_program_modules_optimal(ctx, zink_screen(ctx->base.screen), prog, &ctx->gfx_pipeline_state);
       }
       simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]);
       if (prog && prog != ctx->curr_program)
@@ -699,6 +722,24 @@ zink_gfx_program_update_optimal(struct zink_context *ctx)
       /* remove old hash */
       ctx->gfx_pipeline_state.optimal_key = ctx->gfx_pipeline_state.shader_keys_optimal.key.val;
       ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash;
+      if (ctx->curr_program->is_separable) {
+         struct zink_gfx_program *prog = ctx->curr_program;
+         if (prog->is_separable && !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key)) {
+            util_queue_fence_wait(&prog->base.cache_fence);
+            /* shader variants can't be handled by separable programs: sync and compile */
+            struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)];
+            const uint32_t hash = ctx->gfx_hash;
+            simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]);
+            struct hash_entry *entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages);
+            struct zink_gfx_program *real = prog->full_prog;
+            entry->data = real;
+            prog->full_prog = NULL;
+            prog->base.removed = true;
+            zink_gfx_program_reference(zink_screen(ctx->base.screen), &prog, NULL);
+            ctx->curr_program = real;
+            simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]);
+         }
+      }
       update_gfx_program_optimal(ctx, ctx->curr_program);
       /* apply new hash */
       ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash;
@@ -969,6 +1010,112 @@ fail:
    return NULL;
 }
 
+/* Queue job: builds the full zink_gfx_program for a separable program's shaders and stores it in
+ * prog->full_prog, then precompiles it; it is swapped in for the fast-linked separable program later.
+ */
+static void
+create_linked_separable_job(void *data, void *gdata, int thread_index)
+{
+   struct zink_gfx_program *prog = data; /* the separable program that requested the replacement */
+   prog->full_prog = zink_create_gfx_program(prog->ctx, prog->shaders, 0); /* NOTE(review): result is not NULL-checked before use below */
+   precompile_job(prog->full_prog, gdata, thread_index);
+}
+
+struct zink_gfx_program *
+create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch)
+{ /* Fast-link the precompiled VS and FS pipeline libraries into a "separable" program; falls back to zink_create_gfx_program() when that is not possible. */
+   struct zink_screen *screen = zink_screen(ctx->base.screen);
+   unsigned shader_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) | BITFIELD_BIT(MESA_SHADER_FRAGMENT);
+   /* filter cases that need real pipelines */
+   if (ctx->shader_stages != shader_stages ||
+       !stages[MESA_SHADER_VERTEX]->precompile.mod || !stages[MESA_SHADER_FRAGMENT]->precompile.mod ||
+       /* TODO: maybe try variants? grimace */
+       !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) ||
+       !zink_can_use_pipeline_libs(ctx))
+      return zink_create_gfx_program(ctx, stages, vertices_per_patch);
+   /* ensure async gpl creation is done */
+   util_queue_fence_wait(&stages[MESA_SHADER_VERTEX]->precompile.fence);
+   util_queue_fence_wait(&stages[MESA_SHADER_FRAGMENT]->precompile.fence);
+
+   struct zink_gfx_program *prog = create_program(ctx, false);
+   if (!prog)
+      goto fail;
+
+   prog->ctx = ctx;
+   prog->is_separable = true;
+
+   prog->shaders[MESA_SHADER_VERTEX] = stages[MESA_SHADER_VERTEX];
+   prog->stages_remaining = prog->stages_present = shader_stages;
+   prog->shaders[MESA_SHADER_FRAGMENT] = stages[MESA_SHADER_FRAGMENT];
+   prog->last_vertex_stage = stages[MESA_SHADER_VERTEX];
+   _mesa_set_init(&prog->libs, prog, hash_pipeline_lib, equals_pipeline_lib);
+
+   unsigned refs = 0;
+   for (int i = 0; i < ZINK_GFX_SHADER_COUNT; ++i) {
+      if (prog->shaders[i]) {
+         simple_mtx_lock(&prog->shaders[i]->lock);
+         _mesa_set_add(prog->shaders[i]->programs, prog);
+         simple_mtx_unlock(&prog->shaders[i]->lock);
+         refs++;
+      }
+   }
+   /* We can do this add after the _mesa_set_adds above because we know the prog->shaders[] are
+   * referenced by the draw state and zink_shader_free() can't be called on them while we're in here.
+   */
+   p_atomic_add(&prog->base.reference.count, refs);
+
+   for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) {
+      for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) {
+         _mesa_hash_table_init(&prog->pipelines[r][i], prog, NULL, zink_get_gfx_pipeline_eq_func(screen, prog));
+         /* only need first 3/4 for point/line/tri/patch */
+         if (screen->info.have_EXT_extended_dynamic_state &&
+             i == (prog->last_vertex_stage->nir->info.stage == MESA_SHADER_TESS_EVAL ? 4 : 3))
+            break;
+      }
+   }
+
+   if (prog->shaders[MESA_SHADER_VERTEX]->precompile.dsl) { /* vertex descriptors always live in set 0 */
+      prog->base.dd.binding_usage |= BITFIELD_BIT(0);
+      prog->base.dd.db_template[0] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_template;
+      prog->base.dd.db_size[0] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_size;
+      prog->base.dd.db_offset[0] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_offset;
+      prog->base.dsl[0] = prog->shaders[MESA_SHADER_VERTEX]->precompile.dsl;
+      prog->base.num_dsl = 1;
+   }
+   if (prog->shaders[MESA_SHADER_FRAGMENT]->precompile.dsl) { /* fragment descriptors always live in set 1, matching binding_usage and the FS library's own layout */
+      prog->base.dd.binding_usage |= BITFIELD_BIT(1);
+      prog->base.dd.db_template[1] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_template;
+      prog->base.dd.db_size[1] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_size;
+      prog->base.dd.db_offset[1] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_offset;
+      prog->base.dsl[1] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.dsl;
+      /* guarantee a null dsl if vs doesn't have descriptors */
+      prog->base.num_dsl = 2;
+   }
+   prog->base.dd.bindless = prog->shaders[MESA_SHADER_VERTEX]->bindless | prog->shaders[MESA_SHADER_FRAGMENT]->bindless;
+   if (prog->base.dd.bindless) {
+      prog->base.num_dsl = screen->compact_descriptors ? ZINK_DESCRIPTOR_ALL_TYPES - ZINK_DESCRIPTOR_COMPACT : ZINK_DESCRIPTOR_ALL_TYPES;
+      prog->base.dsl[screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS]] = screen->bindless_layout;
+   }
+   prog->base.layout = zink_pipeline_layout_create(screen, prog->base.dsl, prog->base.num_dsl, false, VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT);
+
+   VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl};
+   prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key;
+
+   struct zink_gfx_library_key *gkey = rzalloc(prog, struct zink_gfx_library_key);
+   gkey->optimal_key = prog->last_variant_hash;
+   assert(gkey->optimal_key); /* only the default key reaches this point (see the filter above); it is expected to be non-zero */
+   gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false);
+   _mesa_set_add(&prog->libs, gkey);
+
+   util_queue_add_job(&screen->cache_get_thread, prog, &prog->base.cache_fence, create_linked_separable_job, NULL, 0); /* build the full program in the background; completion is signalled on cache_fence */
+
+   return prog;
+fail:
+   if (prog)
+      zink_destroy_gfx_program(screen, prog);
+   return NULL;
+}
+
 static uint32_t
 hash_compute_pipeline_state_local_size(const void *key)
 {
@@ -1203,6 +1350,8 @@ zink_destroy_gfx_program(struct zink_screen *screen,
       max_idx++;
    }
 
+   if (prog->is_separable)
+      zink_gfx_program_reference(screen, &prog->full_prog, NULL);
    for (unsigned r = 0; r < ARRAY_SIZE(prog->pipelines); r++) {
       for (int i = 0; i < max_idx; ++i) {
          hash_table_foreach(&prog->pipelines[r][i], entry) {
@@ -1223,11 +1372,13 @@ zink_destroy_gfx_program(struct zink_screen *screen,
          _mesa_set_remove_key(prog->shaders[i]->programs, prog);
          prog->shaders[i] = NULL;
       }
-      destroy_shader_cache(screen, &prog->shader_cache[i][0][0]);
-      destroy_shader_cache(screen, &prog->shader_cache[i][0][1]);
-      destroy_shader_cache(screen, &prog->shader_cache[i][1][0]);
-      destroy_shader_cache(screen, &prog->shader_cache[i][1][1]);
-      ralloc_free(prog->nir[i]);
+      if (!prog->is_separable) {
+         destroy_shader_cache(screen, &prog->shader_cache[i][0][0]);
+         destroy_shader_cache(screen, &prog->shader_cache[i][0][1]);
+         destroy_shader_cache(screen, &prog->shader_cache[i][1][0]);
+         destroy_shader_cache(screen, &prog->shader_cache[i][1][1]);
+         ralloc_free(prog->nir[i]);
+      }
    }
 
    set_foreach_remove(&prog->libs, he) {
@@ -1761,6 +1912,20 @@ precompile_job(void *data, void *gdata, int thread_index)
    zink_screen_update_pipeline_cache(screen, &prog->base, true);
 }
 
+static void
+precompile_separate_shader_job(void *data, void *gdata, int thread_index)
+{ /* queue job: build the standalone module + pipeline library for one separate shader; thread_index is unused */
+   struct zink_screen *screen = gdata; /* the job's global data is used as the screen */
+   struct zink_shader *zs = data; /* the job's data is the shader being precompiled */
+
+   nir_shader *nir; /* uninitialized here; expected to be set through the out-param of the call below */
+   zs->precompile.mod = zink_shader_compile_separate(screen, zs, &nir); /* NOTE(review): result is not checked for failure */
+   zink_descriptor_shader_init(screen, zs); /* presumably fills in precompile.layout, which is consumed below -- TODO confirm */
+   VkShaderModule mods[ZINK_GFX_SHADER_COUNT] = {0}; /* every stage slot starts out as a null handle */
+   mods[nir->info.stage] = zs->precompile.mod; /* only this shader's own stage slot is populated */
+   zs->precompile.gpl = zink_create_gfx_pipeline_separate(screen, mods, zs->precompile.layout); /* NOTE(review): result is not checked for failure */
+}
+
 static void
 zink_link_gfx_shader(struct pipe_context *pctx, void **shaders)
 {
@@ -1769,8 +1934,17 @@ zink_link_gfx_shader(struct pipe_context *pctx, void **shaders)
    if (shaders[MESA_SHADER_COMPUTE])
       return;
    /* can't precompile fixedfunc */
-   if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT])
+   if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT]) {
+      if (shaders[MESA_SHADER_VERTEX] || shaders[MESA_SHADER_FRAGMENT]) {
+         struct zink_shader *zs = shaders[MESA_SHADER_VERTEX] ? shaders[MESA_SHADER_VERTEX] : shaders[MESA_SHADER_FRAGMENT];
+         if (zs->nir->info.separate_shader && !zs->precompile.mod && util_queue_fence_is_signalled(&zs->precompile.fence) &&
+             zink_descriptor_mode == ZINK_DESCRIPTOR_MODE_DB &&
+             /* sample shading can't precompile */
+             (!shaders[MESA_SHADER_FRAGMENT] || !zs->nir->info.fs.uses_sample_shading))
+            util_queue_add_job(&zink_screen(pctx->screen)->cache_get_thread, zs, &zs->precompile.fence, precompile_separate_shader_job, NULL, 0);
+      }
       return;
+   }
    unsigned hash = 0;
    unsigned shader_stages = 0;
    for (unsigned i = 0; i < ZINK_GFX_SHADER_COUNT; i++) {
diff --git a/src/gallium/drivers/zink/zink_program_state.hpp b/src/gallium/drivers/zink/zink_program_state.hpp
index dba466455cc..45550cee83a 100644
--- a/src/gallium/drivers/zink/zink_program_state.hpp
+++ b/src/gallium/drivers/zink/zink_program_state.hpp
@@ -190,10 +190,12 @@ zink_get_gfx_pipeline(struct zink_context *ctx,
          /* this is the graphics pipeline library path: find/construct all partial pipelines */
          struct set_entry *he = _mesa_set_search(&prog->libs, &ctx->gfx_pipeline_state.optimal_key);
          struct zink_gfx_library_key *gkey;
-         if (he)
+         if (he) {
             gkey = (struct zink_gfx_library_key *)he->key;
-         else
+         } else {
+            assert(!prog->is_separable);
             gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state);
+         }
          struct zink_gfx_input_key *ikey = DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ?
                                              zink_find_or_create_input_dynamic(ctx, vkmode) :
                                              zink_find_or_create_input(ctx, vkmode);
@@ -215,7 +217,7 @@ zink_get_gfx_pipeline(struct zink_context *ctx,
 
       zink_screen_update_pipeline_cache(screen, &prog->base, false);
       pc_entry->pipeline = pipeline;
-      if (HAVE_LIB)
+      if (HAVE_LIB && !prog->is_separable)
          /* trigger async optimized pipeline compile if this was the fast-linked unoptimized pipeline */
          zink_gfx_program_compile_queue(ctx, pc_entry);
    }
diff --git a/src/gallium/drivers/zink/zink_screen.c b/src/gallium/drivers/zink/zink_screen.c
index 144fe4617fd..c5fd4377bed 100644
--- a/src/gallium/drivers/zink/zink_screen.c
+++ b/src/gallium/drivers/zink/zink_screen.c
@@ -188,6 +188,8 @@ zink_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *s
    }
 
    struct zink_shader *zs = shader;
+   if (!util_queue_fence_is_signalled(&zs->precompile.fence))
+      return false;
    bool finished = true;
    set_foreach(zs->programs, entry) {
       struct zink_gfx_program *prog = (void*)entry->key;
diff --git a/src/gallium/drivers/zink/zink_types.h b/src/gallium/drivers/zink/zink_types.h
index b81a1d0e2d0..806429d28c2 100644
--- a/src/gallium/drivers/zink/zink_types.h
+++ b/src/gallium/drivers/zink/zink_types.h
@@ -732,6 +732,19 @@ struct zink_shader {
    bool has_uniforms;
    struct spirv_shader *spirv;
 
+   struct { //per-shader state for precompiling a separate shader on its own
+      struct util_queue_fence fence; //fence handed to util_queue_add_job for the precompile job
+      VkShaderModule mod; //module returned by zink_shader_compile_separate
+      VkDescriptorSetLayout dsl; //presumably created from 'bindings' -- TODO confirm
+      VkPipelineLayout layout; //layout passed to zink_create_gfx_pipeline_separate
+      VkPipeline gpl; //pipeline returned by zink_create_gfx_pipeline_separate
+      VkDescriptorSetLayoutBinding *bindings;
+      unsigned num_bindings; //presumably the element count of 'bindings' -- TODO confirm
+      struct zink_descriptor_template *db_template; //NOTE(review): 'db' looks like descriptor buffer -- confirm
+      unsigned db_size;
+      unsigned *db_offset;
+   } precompile;
+
    simple_mtx_t lock;
    struct set *programs;
 
@@ -973,26 +986,30 @@ struct zink_gfx_pipeline_cache_entry {
 struct zink_gfx_program {
    struct zink_program base;
 
+   bool is_separable; //not a full program
    struct zink_context *ctx; //the owner context
 
    uint32_t stages_present; //mask of stages present in this program
    uint32_t stages_remaining; //mask of zink_shader remaining in this program
-   struct nir_shader *nir[ZINK_GFX_SHADER_COUNT];
-
-   VkShaderModule modules[ZINK_GFX_SHADER_COUNT]; // compute stage doesn't belong here
-   uint32_t module_hash[ZINK_GFX_SHADER_COUNT];
-
-   struct zink_shader *last_vertex_stage;
-
-   struct util_dynarray shader_cache[ZINK_GFX_SHADER_COUNT][2][2]; //normal, nonseamless cubes, inline uniforms
-   unsigned inlined_variant_count[ZINK_GFX_SHADER_COUNT];
 
    struct zink_shader *shaders[ZINK_GFX_SHADER_COUNT];
-   struct hash_table pipelines[2][11]; // [dynamic, renderpass][number of draw modes we support]
+   struct zink_shader *last_vertex_stage;
+
+   /* full */
+   VkShaderModule modules[ZINK_GFX_SHADER_COUNT]; // compute stage doesn't belong here
+   uint32_t module_hash[ZINK_GFX_SHADER_COUNT];
+   struct nir_shader *nir[ZINK_GFX_SHADER_COUNT];
+   struct util_dynarray shader_cache[ZINK_GFX_SHADER_COUNT][2][2]; //normal, nonseamless cubes, inline uniforms
+   unsigned inlined_variant_count[ZINK_GFX_SHADER_COUNT];
    uint32_t default_variant_hash;
-   uint32_t last_variant_hash;
    uint8_t inline_variants; //which stages are using inlined uniforms
 
+   /* separable */
+   struct zink_gfx_program *full_prog;
+
+   struct hash_table pipelines[2][11]; // [dynamic, renderpass][number of draw modes we support]
+   uint32_t last_variant_hash;
+
    uint32_t last_finalized_hash[2][4]; //[dynamic, renderpass][primtype idx]
    VkPipeline last_pipeline[2][4]; //[dynamic, renderpass][primtype idx]