From e3b746e3a31e350e9f0962717e49acba28efee30 Mon Sep 17 00:00:00 2001 From: Mike Blumenkrantz Date: Tue, 7 Feb 2023 13:32:21 -0500 Subject: [PATCH] zink: use GPL to handle (simple) separate shader objects apps/games using separate shader objects end up passing the separable shaders to the link_shader hook individually, which is still not ideal for zink's usage since the more optimal path is to have all the shaders and create a RAST+FS GPL stage that can run all the inter-stage io handlers it IS technically possible to handle this for simple VS+FS pipelines using GPL, however, but it's kinda gross. such shaders now use descriptor buffer to create their own pipelines/layouts/descriptors async, and then a "separable" variant of the gfx program can be created by fast-linking these together the "separable" gfx program can't handle shader variants, but it can do basic pipeline caching for PSO state changes, which makes it flexible enough to sorta kinda maybe handle the most basic cases of separate shader objects descriptor buffer is used because having to create and manage a separate architecture for sets/pools/templates is too nightmarish even for me this is, at best, a partial solution, but it's the best the vulkan api can currently do Part-of: --- src/gallium/drivers/zink/zink_compiler.c | 44 ++++ src/gallium/drivers/zink/zink_compiler.h | 2 + src/gallium/drivers/zink/zink_descriptors.c | 166 +++++++++++++++ src/gallium/drivers/zink/zink_descriptors.h | 8 +- src/gallium/drivers/zink/zink_pipeline.c | 11 +- src/gallium/drivers/zink/zink_pipeline.h | 2 + src/gallium/drivers/zink/zink_program.c | 190 +++++++++++++++++- .../drivers/zink/zink_program_state.hpp | 8 +- src/gallium/drivers/zink/zink_screen.c | 2 + src/gallium/drivers/zink/zink_types.h | 39 +++- 10 files changed, 447 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c index c8d22f1f076..2352fe17aa4 100644 --- 
a/src/gallium/drivers/zink/zink_compiler.c +++ b/src/gallium/drivers/zink/zink_compiler.c @@ -24,6 +24,7 @@ #include "nir_opcodes.h" #include "zink_context.h" #include "zink_compiler.h" +#include "zink_descriptors.h" #include "zink_program.h" #include "zink_screen.h" #include "nir_to_spirv/nir_to_spirv.h" @@ -3205,6 +3206,39 @@ zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, return mod; } +VkShaderModule +zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, nir_shader **ret_nir) +{ + nir_shader *nir = nir_shader_clone(NULL, zs->nir); + int set = nir->info.stage == MESA_SHADER_FRAGMENT; + unsigned offsets[4]; + zink_descriptor_shader_get_binding_offsets(zs, offsets); + nir_foreach_variable_with_modes(var, nir, nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_uniform | nir_var_image) { + if (var->data.bindless) + continue; + var->data.descriptor_set = set; + switch (var->data.mode) { + case nir_var_mem_ubo: + var->data.binding = !!var->data.driver_location; + break; + case nir_var_uniform: + if (glsl_type_is_sampler(glsl_without_array(var->type))) + var->data.binding += offsets[1]; + break; + case nir_var_mem_ssbo: + var->data.binding += offsets[2]; + break; + case nir_var_image: + var->data.binding += offsets[3]; + break; + default: break; + } + } + optimize_nir(nir, zs); + *ret_nir = nir; + return compile_module(screen, zs, nir); +} + static bool lower_baseinstance_instr(nir_builder *b, nir_instr *instr, void *data) { @@ -4196,6 +4230,7 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, ret->sinfo.have_vulkan_memory_model = screen->info.have_KHR_vulkan_memory_model; + util_queue_fence_init(&ret->precompile.fence); ret->hash = _mesa_hash_pointer(ret); ret->programs = _mesa_pointer_set_create(NULL); @@ -4490,8 +4525,16 @@ zink_shader_free(struct zink_screen *screen, struct zink_shader *shader) shader->non_fs.generated_gs = NULL; } _mesa_set_destroy(shader->programs, NULL); + 
util_queue_fence_wait(&shader->precompile.fence); + util_queue_fence_destroy(&shader->precompile.fence); + zink_descriptor_shader_deinit(screen, shader); + if (shader->precompile.mod) + VKSCR(DestroyShaderModule)(screen->dev, shader->precompile.mod, NULL); + if (shader->precompile.gpl) + VKSCR(DestroyPipeline)(screen->dev, shader->precompile.gpl, NULL); ralloc_free(shader->nir); ralloc_free(shader->spirv); + free(shader->precompile.bindings); ralloc_free(shader); } @@ -4530,6 +4573,7 @@ struct zink_shader * zink_shader_tcs_create(struct zink_screen *screen, struct zink_shader *vs, unsigned vertices_per_patch) { struct zink_shader *ret = rzalloc(NULL, struct zink_shader); + util_queue_fence_init(&ret->precompile.fence); ret->hash = _mesa_hash_pointer(ret); ret->programs = _mesa_pointer_set_create(NULL); simple_mtx_init(&ret->lock, mtx_plain); diff --git a/src/gallium/drivers/zink/zink_compiler.h b/src/gallium/drivers/zink/zink_compiler.h index a1c894d3853..30a3111e68f 100644 --- a/src/gallium/drivers/zink/zink_compiler.h +++ b/src/gallium/drivers/zink/zink_compiler.h @@ -63,6 +63,8 @@ zink_compiler_assign_io(struct zink_screen *screen, nir_shader *producer, nir_sh VkShaderModule zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shader *nir, const struct zink_shader_key *key, const void *extra_data); VkShaderModule +zink_shader_compile_separate(struct zink_screen *screen, struct zink_shader *zs, nir_shader **ret_nir); +VkShaderModule zink_shader_spirv_compile(struct zink_screen *screen, struct zink_shader *zs, struct spirv_shader *spirv); struct zink_shader * zink_shader_create(struct zink_screen *screen, struct nir_shader *nir, diff --git a/src/gallium/drivers/zink/zink_descriptors.c b/src/gallium/drivers/zink/zink_descriptors.c index cc3519d65e1..68f0ef07fe0 100644 --- a/src/gallium/drivers/zink/zink_descriptors.c +++ b/src/gallium/drivers/zink/zink_descriptors.c @@ -670,6 +670,96 @@ zink_descriptor_program_init(struct zink_context *ctx, 
struct zink_program *pg) return true; } +void +zink_descriptor_shader_get_binding_offsets(const struct zink_shader *shader, unsigned *offsets) +{ + offsets[ZINK_DESCRIPTOR_TYPE_UBO] = 0; /* each following type starts past the previous type's last binding; a type with no bindings reserves a single slot instead of reading bindings[type][-1] */ + offsets[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] = shader->num_bindings[ZINK_DESCRIPTOR_TYPE_UBO] ? shader->bindings[ZINK_DESCRIPTOR_TYPE_UBO][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_UBO] - 1].binding + 1 : 1; + offsets[ZINK_DESCRIPTOR_TYPE_SSBO] = offsets[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] + (shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] ? shader->bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SAMPLER_VIEW] - 1].binding + 1 : 1); + offsets[ZINK_DESCRIPTOR_TYPE_IMAGE] = offsets[ZINK_DESCRIPTOR_TYPE_SSBO] + (shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SSBO] ? shader->bindings[ZINK_DESCRIPTOR_TYPE_SSBO][shader->num_bindings[ZINK_DESCRIPTOR_TYPE_SSBO] - 1].binding + 1 : 1); +} + +void +zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shader) +{ + VkDescriptorSetLayoutBinding bindings[ZINK_DESCRIPTOR_BASE_TYPES * ZINK_MAX_DESCRIPTORS_PER_TYPE]; + unsigned num_bindings = 0; + VkShaderStageFlagBits stage_flags = mesa_to_vk_shader_stage(shader->nir->info.stage); + + unsigned desc_set_size = shader->has_uniforms; + for (unsigned i = 0; i < ZINK_DESCRIPTOR_BASE_TYPES; i++) + desc_set_size += shader->num_bindings[i]; + if (desc_set_size) + shader->precompile.db_template = rzalloc_array(shader, struct zink_descriptor_template, desc_set_size); + + if (shader->has_uniforms) { + VkDescriptorSetLayoutBinding *binding = &bindings[num_bindings]; + binding->binding = 0; + binding->descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + binding->descriptorCount = 1; + binding->stageFlags = stage_flags; + binding->pImmutableSamplers = NULL; + struct zink_descriptor_template *entry = &shader->precompile.db_template[num_bindings]; + entry->count = 1; + entry->offset = offsetof(struct zink_context, di.db.ubos[shader->nir->info.stage][0]); + entry->stride = sizeof(VkDescriptorAddressInfoEXT); + entry->db_size = screen->info.db_props.robustUniformBufferDescriptorSize; + num_bindings++; + } + /* sync with
zink_shader_compile_separate() */ + unsigned offsets[4]; + zink_descriptor_shader_get_binding_offsets(shader, offsets); + for (int j = 0; j < ZINK_DESCRIPTOR_BASE_TYPES; j++) { + for (int k = 0; k < shader->num_bindings[j]; k++) { + VkDescriptorSetLayoutBinding *binding = &bindings[num_bindings]; + if (j == ZINK_DESCRIPTOR_TYPE_UBO) + binding->binding = 1; + else + binding->binding = shader->bindings[j][k].binding + offsets[j]; + binding->descriptorType = shader->bindings[j][k].type; + binding->descriptorCount = shader->bindings[j][k].size; + binding->stageFlags = stage_flags; + binding->pImmutableSamplers = NULL; + + unsigned temp = 0; + init_db_template_entry(screen, shader, j, k, &shader->precompile.db_template[num_bindings], &temp); + num_bindings++; + } + } + if (num_bindings) { + shader->precompile.dsl = descriptor_layout_create(screen, 0, bindings, num_bindings); + shader->precompile.bindings = mem_dup(bindings, num_bindings * sizeof(VkDescriptorSetLayoutBinding)); + shader->precompile.num_bindings = num_bindings; + VkDeviceSize val; + VKSCR(GetDescriptorSetLayoutSizeEXT)(screen->dev, shader->precompile.dsl, &val); + shader->precompile.db_size = val; + shader->precompile.db_offset = rzalloc_array(shader, uint32_t, num_bindings); + for (unsigned i = 0; i < num_bindings; i++) { + VKSCR(GetDescriptorSetLayoutBindingOffsetEXT)(screen->dev, shader->precompile.dsl, bindings[i].binding, &val); + shader->precompile.db_offset[i] = val; + } + } + VkDescriptorSetLayout dsl[ZINK_DESCRIPTOR_ALL_TYPES] = {0}; + unsigned num_dsl = num_bindings ? 2 : 0; + if (shader->bindless) + num_dsl = screen->compact_descriptors ? 
ZINK_DESCRIPTOR_ALL_TYPES - ZINK_DESCRIPTOR_COMPACT : ZINK_DESCRIPTOR_ALL_TYPES; + if (num_bindings || shader->bindless) { + dsl[shader->nir->info.stage == MESA_SHADER_FRAGMENT] = shader->precompile.dsl; + if (shader->bindless) + dsl[screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS]] = screen->bindless_layout; + } + shader->precompile.layout = zink_pipeline_layout_create(screen, dsl, num_dsl, false, VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT); +} + +void +zink_descriptor_shader_deinit(struct zink_screen *screen, struct zink_shader *shader) +{ + if (shader->precompile.dsl) + VKSCR(DestroyDescriptorSetLayout)(screen->dev, shader->precompile.dsl, NULL); + if (shader->precompile.layout) + VKSCR(DestroyPipelineLayout)(screen->dev, shader->precompile.layout, NULL); +} + /* called during program destroy */ void zink_descriptor_program_deinit(struct zink_screen *screen, struct zink_program *pg) @@ -946,6 +1036,71 @@ populate_sets(struct zink_context *ctx, struct zink_batch_state *bs, return true; } +static void +update_separable(struct zink_context *ctx, struct zink_program *pg) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + struct zink_batch_state *bs = ctx->batch.state; + + unsigned use_buffer = 0; + /* find the least-written buffer to use for this */ + for (unsigned i = 0; i < ARRAY_SIZE(bs->dd.db_offset); i++) { + if (bs->dd.db_offset[i] < bs->dd.db_offset[use_buffer]) + use_buffer = i; + } + VkDescriptorGetInfoEXT info; + info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT; + info.pNext = NULL; + struct zink_gfx_program *prog = (struct zink_gfx_program *)pg; + struct zink_shader *shaders[] = { + prog->shaders[MESA_SHADER_VERTEX]->precompile.num_bindings ? 
prog->shaders[MESA_SHADER_VERTEX] : prog->shaders[MESA_SHADER_FRAGMENT], + prog->shaders[MESA_SHADER_FRAGMENT], + }; + for (unsigned j = 0; j < pg->num_dsl; j++) { + if (!(pg->dd.binding_usage & BITFIELD_BIT(j))) + continue; + uint64_t offset = bs->dd.db_offset[use_buffer]; + assert(bs->dd.db[use_buffer]->obj->size > bs->dd.db_offset[use_buffer] + pg->dd.db_size[j]); + for (unsigned i = 0; i < shaders[j]->precompile.num_bindings; i++) { + info.type = shaders[j]->precompile.bindings[i].descriptorType; + uint64_t desc_offset = offset + pg->dd.db_offset[j][i]; + if (screen->info.db_props.combinedImageSamplerDescriptorSingleArray || + shaders[j]->precompile.bindings[i].descriptorType != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || + shaders[j]->precompile.bindings[i].descriptorCount == 1) { + for (unsigned k = 0; k < shaders[j]->precompile.bindings[i].descriptorCount; k++) { + /* VkDescriptorDataEXT is a union of pointers; the member doesn't matter */ + info.data.pSampler = (void*)(((uint8_t*)ctx) + pg->dd.db_template[j][i].offset + k * pg->dd.db_template[j][i].stride); + VKSCR(GetDescriptorEXT)(screen->dev, &info, pg->dd.db_template[j][i].db_size, bs->dd.db_map[use_buffer] + desc_offset + k * pg->dd.db_template[j][i].db_size); + } + } else { + assert(shaders[j]->precompile.bindings[i].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); + char buf[1024]; + uint8_t *db = bs->dd.db_map[use_buffer] + desc_offset; + uint8_t *samplers = db + shaders[j]->precompile.bindings[i].descriptorCount * screen->info.db_props.sampledImageDescriptorSize; + for (unsigned k = 0; k < shaders[j]->precompile.bindings[i].descriptorCount; k++) { + /* VkDescriptorDataEXT is a union of pointers; the member doesn't matter. separable templates are indexed [set][binding], same as the single-array path above */ + info.data.pSampler = (void*)(((uint8_t*)ctx) + pg->dd.db_template[j][i].offset + + k * pg->dd.db_template[j][i].stride); + VKSCR(GetDescriptorEXT)(screen->dev, &info, 
pg->dd.db_template[j][i].db_size, buf); + /* drivers that don't support combinedImageSamplerDescriptorSingleArray must have sampler arrays written in memory as + * + * | array_of_samplers[] | array_of_sampled_images[] | + * + * which means each descriptor's data must be split + */ + memcpy(db, buf, screen->info.db_props.samplerDescriptorSize); + memcpy(samplers, &buf[screen->info.db_props.samplerDescriptorSize], screen->info.db_props.sampledImageDescriptorSize); + db += screen->info.db_props.sampledImageDescriptorSize; + samplers += screen->info.db_props.samplerDescriptorSize; + } + } + } + bs->dd.cur_db_offset[use_buffer] = bs->dd.db_offset[use_buffer]; + bs->dd.db_offset[use_buffer] += pg->dd.db_size[j]; + VKCTX(CmdSetDescriptorBufferOffsetsEXT)(bs->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pg->layout, j, 1, &use_buffer, &offset); + } +} + /* updates the mask of changed_sets and binds the mask of bind_sets */ static void zink_descriptors_update_masked_buffer(struct zink_context *ctx, bool is_compute, uint8_t changed_sets, uint8_t bind_sets) @@ -1092,6 +1247,17 @@ zink_descriptors_update(struct zink_context *ctx, bool is_compute) ctx->dd.push_state_changed[is_compute] = !!pg->dd.push_usage || ctx->dd.has_fbfetch != bs->dd.has_fbfetch; } + if (!is_compute) { + struct zink_gfx_program *prog = (struct zink_gfx_program*)pg; + if (prog->is_separable) { + /* force all descriptors update on next pass: separables use different layouts */ + ctx->dd.state_changed[is_compute] = BITFIELD_MASK(ZINK_DESCRIPTOR_TYPE_UNIFORMS); + ctx->dd.push_state_changed[is_compute] = true; + update_separable(ctx, pg); + return; + } + } + if (pg != bs->dd.pg[is_compute]) { /* if we don't already know that we have to update all sets, * check to see if any dsls changed diff --git a/src/gallium/drivers/zink/zink_descriptors.h b/src/gallium/drivers/zink/zink_descriptors.h index c3705b693a7..e61d24c75d8 100644 --- a/src/gallium/drivers/zink/zink_descriptors.h +++
b/src/gallium/drivers/zink/zink_descriptors.h @@ -154,8 +154,12 @@ zink_descriptors_deinit_bindless(struct zink_context *ctx); void zink_descriptors_update_bindless(struct zink_context *ctx); - - +void +zink_descriptor_shader_get_binding_offsets(const struct zink_shader *shader, unsigned *offsets); +void +zink_descriptor_shader_init(struct zink_screen *screen, struct zink_shader *shader); +void +zink_descriptor_shader_deinit(struct zink_screen *screen, struct zink_shader *shader); bool zink_descriptor_program_init(struct zink_context *ctx, struct zink_program *pg); diff --git a/src/gallium/drivers/zink/zink_pipeline.c b/src/gallium/drivers/zink/zink_pipeline.c index bea5c155fc4..6d7f716e28b 100644 --- a/src/gallium/drivers/zink/zink_pipeline.c +++ b/src/gallium/drivers/zink/zink_pipeline.c @@ -751,7 +751,10 @@ create_gfx_pipeline_library(struct zink_screen *screen, VkShaderModule *modules, pci.pStages = shader_stages; pci.stageCount = num_stages; - /* only add LTO for full pipeline libs */ + /* Only keep LTO information for full pipeline libs. For separable shaders, they will only + * ever be used with fast linking, and to optimize them a new pipeline lib will be created with full + * link time information for the full set of shader stages (rather than linking in these single-stage libs). 
+ */ if (num_stages > 1) pci.flags |= VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT; @@ -770,6 +773,12 @@ zink_create_gfx_pipeline_library(struct zink_screen *screen, struct zink_gfx_pro return create_gfx_pipeline_library(screen, prog->modules, prog->base.layout, prog->base.pipeline_cache); } +VkPipeline +zink_create_gfx_pipeline_separate(struct zink_screen *screen, VkShaderModule *modules, VkPipelineLayout layout) +{ + return create_gfx_pipeline_library(screen, modules, layout, VK_NULL_HANDLE); +} + VkPipeline zink_create_gfx_pipeline_combined(struct zink_screen *screen, struct zink_gfx_program *prog, VkPipeline input, VkPipeline *library, unsigned libcount, VkPipeline output, bool optimized) { diff --git a/src/gallium/drivers/zink/zink_pipeline.h b/src/gallium/drivers/zink/zink_pipeline.h index 11f86ac9fae..c6d5001c074 100644 --- a/src/gallium/drivers/zink/zink_pipeline.h +++ b/src/gallium/drivers/zink/zink_pipeline.h @@ -62,6 +62,8 @@ VkPipeline zink_create_gfx_pipeline_output(struct zink_screen *screen, struct zink_gfx_pipeline_state *state); VkPipeline zink_create_gfx_pipeline_combined(struct zink_screen *screen, struct zink_gfx_program *prog, VkPipeline input, VkPipeline *library, unsigned libcount, VkPipeline output, bool optimized); +VkPipeline +zink_create_gfx_pipeline_separate(struct zink_screen *screen, VkShaderModule *modules, VkPipelineLayout layout); #ifdef __cplusplus } #endif diff --git a/src/gallium/drivers/zink/zink_program.c b/src/gallium/drivers/zink/zink_program.c index 02ec650145b..9a255255ec5 100644 --- a/src/gallium/drivers/zink/zink_program.c +++ b/src/gallium/drivers/zink/zink_program.c @@ -44,6 +44,11 @@ #define XXH_INLINE_ALL #include "util/xxhash.h" +static void +precompile_job(void *data, void *gdata, int thread_index); +struct zink_gfx_program * +create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch); + void debug_describe_zink_gfx_program(char *buf, const 
struct zink_gfx_program *ptr) { @@ -645,6 +650,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr { const union zink_shader_key_optimal *optimal_key = (union zink_shader_key_optimal*)&prog->last_variant_hash; if (ctx->gfx_pipeline_state.shader_keys_optimal.key.vs_bits != optimal_key->vs_bits) { + assert(!prog->is_separable); bool changed = update_gfx_shader_module_optimal(ctx, prog, ctx->last_vertex_stage->nir->info.stage); ctx->gfx_pipeline_state.modules_changed |= changed; } @@ -652,6 +658,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr if (ctx->gfx_pipeline_state.shader_keys_optimal.key.fs_bits != optimal_key->fs_bits || /* always recheck shadow swizzles since they aren't directly part of the key */ unlikely(shadow_needs_shader_swizzle)) { + assert(!prog->is_separable); bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_FRAGMENT); ctx->gfx_pipeline_state.modules_changed |= changed; if (unlikely(shadow_needs_shader_swizzle)) { @@ -661,6 +668,7 @@ update_gfx_program_optimal(struct zink_context *ctx, struct zink_gfx_program *pr } if (prog->shaders[MESA_SHADER_TESS_CTRL] && prog->shaders[MESA_SHADER_TESS_CTRL]->non_fs.is_generated && ctx->gfx_pipeline_state.shader_keys_optimal.key.tcs_bits != optimal_key->tcs_bits) { + assert(!prog->is_separable); bool changed = update_gfx_shader_module_optimal(ctx, prog, MESA_SHADER_TESS_CTRL); ctx->gfx_pipeline_state.modules_changed |= changed; } @@ -682,13 +690,28 @@ zink_gfx_program_update_optimal(struct zink_context *ctx) ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; if (entry) { prog = (struct zink_gfx_program*)entry->data; + if (prog->is_separable) { + /* shader variants can't be handled by separable programs: sync and compile */ + if (!ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key)) + util_queue_fence_wait(&prog->base.cache_fence); + /* If the optimized linked pipeline is done 
compiling, swap it into place. */ + if (util_queue_fence_is_signalled(&prog->base.cache_fence)) { + struct zink_gfx_program *real = prog->full_prog; + entry->data = real; + prog->full_prog = NULL; + prog->base.removed = true; + zink_gfx_program_reference(zink_screen(ctx->base.screen), &prog, NULL); + prog = real; + } + } update_gfx_program_optimal(ctx, prog); } else { ctx->dirty_gfx_stages |= ctx->shader_stages; - prog = zink_create_gfx_program(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch); + prog = create_gfx_program_separable(ctx, ctx->gfx_stages, ctx->gfx_pipeline_state.dyn_state2.vertices_per_patch); zink_screen_get_pipeline_cache(zink_screen(ctx->base.screen), &prog->base, false); _mesa_hash_table_insert_pre_hashed(ht, hash, prog->shaders, prog); - generate_gfx_program_modules_optimal(ctx, zink_screen(ctx->base.screen), prog, &ctx->gfx_pipeline_state); + if (!prog->is_separable) + generate_gfx_program_modules_optimal(ctx, zink_screen(ctx->base.screen), prog, &ctx->gfx_pipeline_state); } simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); if (prog && prog != ctx->curr_program) @@ -699,6 +722,24 @@ zink_gfx_program_update_optimal(struct zink_context *ctx) /* remove old hash */ ctx->gfx_pipeline_state.optimal_key = ctx->gfx_pipeline_state.shader_keys_optimal.key.val; ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; + if (ctx->curr_program->is_separable) { + struct zink_gfx_program *prog = ctx->curr_program; + if (prog->is_separable && !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key)) { + util_queue_fence_wait(&prog->base.cache_fence); + /* shader variants can't be handled by separable programs: sync and compile */ + struct hash_table *ht = &ctx->program_cache[zink_program_cache_stages(ctx->shader_stages)]; + const uint32_t hash = ctx->gfx_hash; + simple_mtx_lock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); + struct hash_entry 
*entry = _mesa_hash_table_search_pre_hashed(ht, hash, ctx->gfx_stages); + struct zink_gfx_program *real = prog->full_prog; + entry->data = real; + prog->full_prog = NULL; + prog->base.removed = true; + zink_gfx_program_reference(zink_screen(ctx->base.screen), &prog, NULL); + ctx->curr_program = real; + simple_mtx_unlock(&ctx->program_lock[zink_program_cache_stages(ctx->shader_stages)]); + } + } update_gfx_program_optimal(ctx, ctx->curr_program); /* apply new hash */ ctx->gfx_pipeline_state.final_hash ^= ctx->curr_program->last_variant_hash; @@ -969,6 +1010,112 @@ fail: return NULL; } +/* Creates a replacement, optimized zink_gfx_program for this set of separate shaders, which will + * be swapped in in place of the fast-linked separable program once it's done compiling. + */ +static void +create_linked_separable_job(void *data, void *gdata, int thread_index) +{ + struct zink_gfx_program *prog = data; + prog->full_prog = zink_create_gfx_program(prog->ctx, prog->shaders, 0); + precompile_job(prog->full_prog, gdata, thread_index); +} + +struct zink_gfx_program * +create_gfx_program_separable(struct zink_context *ctx, struct zink_shader **stages, unsigned vertices_per_patch) +{ + struct zink_screen *screen = zink_screen(ctx->base.screen); + unsigned shader_stages = BITFIELD_BIT(MESA_SHADER_VERTEX) | BITFIELD_BIT(MESA_SHADER_FRAGMENT); + /* filter cases that need real pipelines */ + if (ctx->shader_stages != shader_stages || + !stages[MESA_SHADER_VERTEX]->precompile.mod || !stages[MESA_SHADER_FRAGMENT]->precompile.mod || + /* TODO: maybe try variants? 
grimace */ + !ZINK_SHADER_KEY_OPTIMAL_IS_DEFAULT(ctx->gfx_pipeline_state.optimal_key) || + !zink_can_use_pipeline_libs(ctx)) + return zink_create_gfx_program(ctx, stages, vertices_per_patch); + /* ensure async gpl creation is done */ + util_queue_fence_wait(&stages[MESA_SHADER_VERTEX]->precompile.fence); + util_queue_fence_wait(&stages[MESA_SHADER_FRAGMENT]->precompile.fence); + + struct zink_gfx_program *prog = create_program(ctx, false); + if (!prog) + goto fail; + + prog->ctx = ctx; + prog->is_separable = true; + + prog->shaders[MESA_SHADER_VERTEX] = stages[MESA_SHADER_VERTEX]; + prog->stages_remaining = prog->stages_present = shader_stages; + prog->shaders[MESA_SHADER_FRAGMENT] = stages[MESA_SHADER_FRAGMENT]; + prog->last_vertex_stage = stages[MESA_SHADER_VERTEX]; + _mesa_set_init(&prog->libs, prog, hash_pipeline_lib, equals_pipeline_lib); + + unsigned refs = 0; + for (int i = 0; i < ZINK_GFX_SHADER_COUNT; ++i) { + if (prog->shaders[i]) { + simple_mtx_lock(&prog->shaders[i]->lock); + _mesa_set_add(prog->shaders[i]->programs, prog); + simple_mtx_unlock(&prog->shaders[i]->lock); + refs++; + } + } + /* We can do this add after the _mesa_set_adds above because we know the prog->shaders[] are + * referenced by the draw state and zink_shader_free() can't be called on them while we're in here. + */ + p_atomic_add(&prog->base.reference.count, refs); + + for (int r = 0; r < ARRAY_SIZE(prog->pipelines); ++r) { + for (int i = 0; i < ARRAY_SIZE(prog->pipelines[0]); ++i) { + _mesa_hash_table_init(&prog->pipelines[r][i], prog, NULL, zink_get_gfx_pipeline_eq_func(screen, prog)); + /* only need first 3/4 for point/line/tri/patch */ + if (screen->info.have_EXT_extended_dynamic_state && + i == (prog->last_vertex_stage->nir->info.stage == MESA_SHADER_TESS_EVAL ? 
4 : 3)) + break; + } + } + + if (prog->shaders[MESA_SHADER_VERTEX]->precompile.dsl) { + prog->base.dd.binding_usage |= BITFIELD_BIT(0); + prog->base.dd.db_template[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_template; + prog->base.dd.db_size[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_size; + prog->base.dd.db_offset[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.db_offset; + prog->base.dsl[prog->base.num_dsl] = prog->shaders[MESA_SHADER_VERTEX]->precompile.dsl; + prog->base.num_dsl++; + } + if (prog->shaders[MESA_SHADER_FRAGMENT]->precompile.dsl) { + prog->base.dd.binding_usage |= BITFIELD_BIT(1); /* fs data always lives at set index 1, even when the vs has no descriptors: zink_shader_compile_separate() assigns fs variables to set 1 and update_separable() indexes by set */ + prog->base.dd.db_template[1] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_template; + prog->base.dd.db_size[1] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_size; + prog->base.dd.db_offset[1] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.db_offset; + prog->base.dsl[1] = prog->shaders[MESA_SHADER_FRAGMENT]->precompile.dsl; + /* guarantee a null dsl if vs doesn't have descriptors */ + prog->base.num_dsl = 2; + } + prog->base.dd.bindless = prog->shaders[MESA_SHADER_VERTEX]->bindless | prog->shaders[MESA_SHADER_FRAGMENT]->bindless; + if (prog->base.dd.bindless) { + prog->base.num_dsl = screen->compact_descriptors ? 
ZINK_DESCRIPTOR_ALL_TYPES - ZINK_DESCRIPTOR_COMPACT : ZINK_DESCRIPTOR_ALL_TYPES; + prog->base.dsl[screen->desc_set_id[ZINK_DESCRIPTOR_BINDLESS]] = screen->bindless_layout; + } + prog->base.layout = zink_pipeline_layout_create(screen, prog->base.dsl, prog->base.num_dsl, false, VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT); + + VkPipeline libs[] = {stages[MESA_SHADER_VERTEX]->precompile.gpl, stages[MESA_SHADER_FRAGMENT]->precompile.gpl}; + prog->last_variant_hash = ctx->gfx_pipeline_state.optimal_key; + + struct zink_gfx_library_key *gkey = rzalloc(prog, struct zink_gfx_library_key); + gkey->optimal_key = prog->last_variant_hash; + assert(gkey->optimal_key); + gkey->pipeline = zink_create_gfx_pipeline_combined(screen, prog, VK_NULL_HANDLE, libs, 2, VK_NULL_HANDLE, false); + _mesa_set_add(&prog->libs, gkey); + + util_queue_add_job(&screen->cache_get_thread, prog, &prog->base.cache_fence, create_linked_separable_job, NULL, 0); + + return prog; +fail: + if (prog) + zink_destroy_gfx_program(screen, prog); + return NULL; +} + static uint32_t hash_compute_pipeline_state_local_size(const void *key) { @@ -1203,6 +1350,8 @@ zink_destroy_gfx_program(struct zink_screen *screen, max_idx++; } + if (prog->is_separable) + zink_gfx_program_reference(screen, &prog->full_prog, NULL); for (unsigned r = 0; r < ARRAY_SIZE(prog->pipelines); r++) { for (int i = 0; i < max_idx; ++i) { hash_table_foreach(&prog->pipelines[r][i], entry) { @@ -1223,11 +1372,13 @@ zink_destroy_gfx_program(struct zink_screen *screen, _mesa_set_remove_key(prog->shaders[i]->programs, prog); prog->shaders[i] = NULL; } - destroy_shader_cache(screen, &prog->shader_cache[i][0][0]); - destroy_shader_cache(screen, &prog->shader_cache[i][0][1]); - destroy_shader_cache(screen, &prog->shader_cache[i][1][0]); - destroy_shader_cache(screen, &prog->shader_cache[i][1][1]); - ralloc_free(prog->nir[i]); + if (!prog->is_separable) { + destroy_shader_cache(screen, &prog->shader_cache[i][0][0]); + destroy_shader_cache(screen,
&prog->shader_cache[i][0][1]); + destroy_shader_cache(screen, &prog->shader_cache[i][1][0]); + destroy_shader_cache(screen, &prog->shader_cache[i][1][1]); + ralloc_free(prog->nir[i]); + } } set_foreach_remove(&prog->libs, he) { @@ -1761,6 +1912,20 @@ precompile_job(void *data, void *gdata, int thread_index) zink_screen_update_pipeline_cache(screen, &prog->base, true); } +static void +precompile_separate_shader_job(void *data, void *gdata, int thread_index) +{ + struct zink_screen *screen = gdata; + struct zink_shader *zs = data; + + nir_shader *nir; + zs->precompile.mod = zink_shader_compile_separate(screen, zs, &nir); + zink_descriptor_shader_init(screen, zs); + VkShaderModule mods[ZINK_GFX_SHADER_COUNT] = {0}; + mods[nir->info.stage] = zs->precompile.mod; + zs->precompile.gpl = zink_create_gfx_pipeline_separate(screen, mods, zs->precompile.layout); +} + static void zink_link_gfx_shader(struct pipe_context *pctx, void **shaders) { @@ -1769,8 +1934,17 @@ zink_link_gfx_shader(struct pipe_context *pctx, void **shaders) if (shaders[MESA_SHADER_COMPUTE]) return; /* can't precompile fixedfunc */ - if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT]) + if (!shaders[MESA_SHADER_VERTEX] || !shaders[MESA_SHADER_FRAGMENT]) { + if (shaders[MESA_SHADER_VERTEX] || shaders[MESA_SHADER_FRAGMENT]) { + struct zink_shader *zs = shaders[MESA_SHADER_VERTEX] ? 
shaders[MESA_SHADER_VERTEX] : shaders[MESA_SHADER_FRAGMENT]; + if (zs->nir->info.separate_shader && !zs->precompile.mod && util_queue_fence_is_signalled(&zs->precompile.fence) && + zink_descriptor_mode == ZINK_DESCRIPTOR_MODE_DB && + /* sample shading can't precompile */ + (!shaders[MESA_SHADER_FRAGMENT] || !zs->nir->info.fs.uses_sample_shading)) + util_queue_add_job(&zink_screen(pctx->screen)->cache_get_thread, zs, &zs->precompile.fence, precompile_separate_shader_job, NULL, 0); + } return; + } unsigned hash = 0; unsigned shader_stages = 0; for (unsigned i = 0; i < ZINK_GFX_SHADER_COUNT; i++) { diff --git a/src/gallium/drivers/zink/zink_program_state.hpp b/src/gallium/drivers/zink/zink_program_state.hpp index dba466455cc..45550cee83a 100644 --- a/src/gallium/drivers/zink/zink_program_state.hpp +++ b/src/gallium/drivers/zink/zink_program_state.hpp @@ -190,10 +190,12 @@ zink_get_gfx_pipeline(struct zink_context *ctx, /* this is the graphics pipeline library path: find/construct all partial pipelines */ struct set_entry *he = _mesa_set_search(&prog->libs, &ctx->gfx_pipeline_state.optimal_key); struct zink_gfx_library_key *gkey; - if (he) + if (he) { gkey = (struct zink_gfx_library_key *)he->key; - else + } else { + assert(!prog->is_separable); gkey = zink_create_pipeline_lib(screen, prog, &ctx->gfx_pipeline_state); + } struct zink_gfx_input_key *ikey = DYNAMIC_STATE == ZINK_DYNAMIC_VERTEX_INPUT ? 
zink_find_or_create_input_dynamic(ctx, vkmode) : zink_find_or_create_input(ctx, vkmode); @@ -215,7 +217,7 @@ zink_get_gfx_pipeline(struct zink_context *ctx, zink_screen_update_pipeline_cache(screen, &prog->base, false); pc_entry->pipeline = pipeline; - if (HAVE_LIB) + if (HAVE_LIB && !prog->is_separable) /* trigger async optimized pipeline compile if this was the fast-linked unoptimized pipeline */ zink_gfx_program_compile_queue(ctx, pc_entry); } diff --git a/src/gallium/drivers/zink/zink_screen.c b/src/gallium/drivers/zink/zink_screen.c index 144fe4617fd..c5fd4377bed 100644 --- a/src/gallium/drivers/zink/zink_screen.c +++ b/src/gallium/drivers/zink/zink_screen.c @@ -188,6 +188,8 @@ zink_is_parallel_shader_compilation_finished(struct pipe_screen *screen, void *s } struct zink_shader *zs = shader; + if (!util_queue_fence_is_signalled(&zs->precompile.fence)) + return false; bool finished = true; set_foreach(zs->programs, entry) { struct zink_gfx_program *prog = (void*)entry->key; diff --git a/src/gallium/drivers/zink/zink_types.h b/src/gallium/drivers/zink/zink_types.h index b81a1d0e2d0..806429d28c2 100644 --- a/src/gallium/drivers/zink/zink_types.h +++ b/src/gallium/drivers/zink/zink_types.h @@ -732,6 +732,19 @@ struct zink_shader { bool has_uniforms; struct spirv_shader *spirv; + struct { + struct util_queue_fence fence; + VkShaderModule mod; + VkDescriptorSetLayout dsl; + VkPipelineLayout layout; + VkPipeline gpl; + VkDescriptorSetLayoutBinding *bindings; + unsigned num_bindings; + struct zink_descriptor_template *db_template; + unsigned db_size; + unsigned *db_offset; + } precompile; + simple_mtx_t lock; struct set *programs; @@ -973,26 +986,30 @@ struct zink_gfx_pipeline_cache_entry { struct zink_gfx_program { struct zink_program base; + bool is_separable; //not a full program struct zink_context *ctx; //the owner context uint32_t stages_present; //mask of stages present in this program uint32_t stages_remaining; //mask of zink_shader remaining in this program - 
struct nir_shader *nir[ZINK_GFX_SHADER_COUNT]; - - VkShaderModule modules[ZINK_GFX_SHADER_COUNT]; // compute stage doesn't belong here - uint32_t module_hash[ZINK_GFX_SHADER_COUNT]; - - struct zink_shader *last_vertex_stage; - - struct util_dynarray shader_cache[ZINK_GFX_SHADER_COUNT][2][2]; //normal, nonseamless cubes, inline uniforms - unsigned inlined_variant_count[ZINK_GFX_SHADER_COUNT]; struct zink_shader *shaders[ZINK_GFX_SHADER_COUNT]; - struct hash_table pipelines[2][11]; // [dynamic, renderpass][number of draw modes we support] + struct zink_shader *last_vertex_stage; + + /* full */ + VkShaderModule modules[ZINK_GFX_SHADER_COUNT]; // compute stage doesn't belong here + uint32_t module_hash[ZINK_GFX_SHADER_COUNT]; + struct nir_shader *nir[ZINK_GFX_SHADER_COUNT]; + struct util_dynarray shader_cache[ZINK_GFX_SHADER_COUNT][2][2]; //normal, nonseamless cubes, inline uniforms + unsigned inlined_variant_count[ZINK_GFX_SHADER_COUNT]; uint32_t default_variant_hash; - uint32_t last_variant_hash; uint8_t inline_variants; //which stages are using inlined uniforms + /* separable */ + struct zink_gfx_program *full_prog; + + struct hash_table pipelines[2][11]; // [dynamic, renderpass][number of draw modes we support] + uint32_t last_variant_hash; + uint32_t last_finalized_hash[2][4]; //[dynamic, renderpass][primtype idx] VkPipeline last_pipeline[2][4]; //[dynamic, renderpass][primtype idx]