diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c index fb0747ff907..5c2e83dfe83 100644 --- a/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -2730,7 +2730,7 @@ update_fs_variant(struct v3dv_cmd_buffer *cmd_buffer) VK_PIPELINE_BIND_POINT_GRAPHICS); VkResult vk_result; - variant = v3dv_get_shader_variant(p_stage, &local_key.base, + variant = v3dv_get_shader_variant(p_stage, NULL, &local_key.base, sizeof(struct v3d_fs_key), &cmd_buffer->device->alloc, &vk_result); @@ -2761,7 +2761,7 @@ update_vs_variant(struct v3dv_cmd_buffer *cmd_buffer) cmd_buffer_populate_v3d_key(&local_key.base, cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS); - variant = v3dv_get_shader_variant(p_stage, &local_key.base, + variant = v3dv_get_shader_variant(p_stage, NULL, &local_key.base, sizeof(struct v3d_vs_key), &cmd_buffer->device->alloc, &vk_result); @@ -2782,7 +2782,7 @@ update_vs_variant(struct v3dv_cmd_buffer *cmd_buffer) cmd_buffer_populate_v3d_key(&local_key.base, cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS); - variant = v3dv_get_shader_variant(p_stage, &local_key.base, + variant = v3dv_get_shader_variant(p_stage, NULL, &local_key.base, sizeof(struct v3d_vs_key), &cmd_buffer->device->alloc, &vk_result); @@ -2813,7 +2813,7 @@ update_cs_variant(struct v3dv_cmd_buffer *cmd_buffer) VK_PIPELINE_BIND_POINT_COMPUTE); VkResult result; - variant = v3dv_get_shader_variant(p_stage, &local_key, + variant = v3dv_get_shader_variant(p_stage, NULL, &local_key, sizeof(struct v3d_key), &cmd_buffer->device->alloc, &result); diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index 9e77656ebca..0d4dacf852e 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -1288,18 +1288,22 @@ pipeline_stage_create_vs_bin(const struct v3dv_pipeline_stage *src, } /* FIXME: right now this just asks for an bo for the exact size of the qpu - * assembly. 
It would be good to be slighly smarter and having one "all - * shaders" bo per pipeline, so each p_stage/variant would save their offset - * on such. That is really relevant due the fact that bo are always aligned to - * 4096, so that would allow to use less memory. + * assembly. It would be good to be able to re-use bos to avoid bo + * fragmentation. This could be tricky though, as right now we are uploading + * the assembly from two paths, when compiling a shader, or when deserializing + * from the pipeline cache. This also means that the same variant can be + * shared by different objects. So with the current approach it is clear who + * owns the assembly bo, but if shared, who owns the shared bo? * * For now one-bo per-assembly would work. * * Returns false if it was not able to allocate or map the assembly bo memory. */ static bool -upload_assembly(struct v3dv_pipeline_stage *p_stage, +upload_assembly(struct v3dv_device *device, struct v3dv_shader_variant *variant, + gl_shader_stage stage, + bool is_coord, const void *data, uint32_t size) { @@ -1308,11 +1312,10 @@ upload_assembly(struct v3dv_pipeline_stage *p_stage, * have any bo */ assert(variant->assembly_bo == NULL); - struct v3dv_device *device = p_stage->pipeline->device; - switch (p_stage->stage) { + switch (stage) { case MESA_SHADER_VERTEX: - name = (p_stage->is_coord == true) ? "coord_shader_assembly" : + name = (is_coord == true) ? "coord_shader_assembly" : "vertex_shader_assembly"; break; case MESA_SHADER_FRAGMENT: @@ -1340,92 +1343,30 @@ upload_assembly(struct v3dv_pipeline_stage *p_stage, memcpy(bo->map, data, size); - v3dv_bo_unmap(device, bo); - + /* We don't unmap the assembly bo, as we would use to gather the assembly + * when serializing the variant. + */ variant->assembly_bo = bo; return true; } -/* For a given key, it returns the compiled version of the shader. 
If it was - * already compiled, it gets it from the p_stage cache, if not it compiles is - * through the v3d compiler +/* + * Adds a shader variant to the pipeline shader variant cache, updates + * pipeline spill structures if needed. * - * If the method returns NULL it means that it was not able to allocate the - * resources for the variant. out_vk_result would return which OOM applies. - * - * Returns a new reference of the shader_variant to the caller. + * Assumes that the caller already checked that the variant is not on such + * cache. */ -struct v3dv_shader_variant* -v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage, - struct v3d_key *key, - size_t key_size, - const VkAllocationCallbacks *pAllocator, - VkResult *out_vk_result) +static void +pipeline_add_variant_to_cache(struct v3dv_pipeline_stage *p_stage, + struct v3d_key *key, + size_t key_size, + struct v3dv_shader_variant *variant) { struct hash_table *ht = p_stage->cache; - struct hash_entry *entry = _mesa_hash_table_search(ht, key); - - if (entry) { - *out_vk_result = VK_SUCCESS; - v3dv_shader_variant_ref(entry->data); - return entry->data; - } - struct v3dv_pipeline *pipeline = p_stage->pipeline; struct v3dv_device *device = pipeline->device; - struct v3dv_shader_variant *variant = - vk_zalloc(&device->alloc, sizeof(*variant), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - - if (variant == NULL) { - *out_vk_result = VK_ERROR_OUT_OF_HOST_MEMORY; - return NULL; - } - variant->ref_cnt = 1; - - struct v3dv_physical_device *physical_device = - &pipeline->device->instance->physicalDevice; - const struct v3d_compiler *compiler = physical_device->compiler; - - uint32_t variant_id = p_atomic_inc_return(&p_stage->compiled_variant_count); - - if (V3D_DEBUG & (V3D_DEBUG_NIR | - v3d_debug_flag_for_shader_stage(p_stage->stage))) { - fprintf(stderr, "Just before v3d_compile: %s prog %d variant %d NIR:\n", - gl_shader_stage_name(p_stage->stage), - p_stage->program_id, - variant_id); - 
nir_print_shader(p_stage->nir, stderr); - fprintf(stderr, "\n"); - } - - uint64_t *qpu_insts; - uint32_t qpu_insts_size; - - qpu_insts = v3d_compile(compiler, - key, &variant->prog_data.base, - p_stage->nir, - shader_debug_output, NULL, - p_stage->program_id, - variant_id, - &qpu_insts_size); - - if (!qpu_insts) { - fprintf(stderr, "Failed to compile %s prog %d NIR to VIR\n", - gl_shader_stage_name(p_stage->stage), - p_stage->program_id); - } else { - if (!upload_assembly(p_stage, variant, qpu_insts, qpu_insts_size)) { - free(qpu_insts); - v3dv_shader_variant_unref(device, variant); - - *out_vk_result = VK_ERROR_OUT_OF_DEVICE_MEMORY; - return NULL; - } - } - - free(qpu_insts); if (ht) { struct v3d_key *dup_key; @@ -1450,8 +1391,184 @@ v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage, v3dv_bo_alloc(device, total_spill_size, "spill", true); pipeline->spill.size_per_thread = variant->prog_data.base->spill_size; } +} + + +static void +pipeline_hash_variant(const struct v3dv_pipeline_stage *p_stage, + struct v3d_key *key, + size_t key_size, + unsigned char *sha1_out) +{ + struct mesa_sha1 ctx; + struct v3dv_pipeline *pipeline = p_stage->pipeline; + _mesa_sha1_init(&ctx); + + if (p_stage->stage == MESA_SHADER_COMPUTE) { + _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1)); + } else { + /* We need to include both on the sha1 key as one could affect the other + * during linking (like if vertex output are constants, then the + * fragment shader would load_const intead of load_input). An + * alternative would be to use the serialized nir, but that seems like + * an overkill + */ + _mesa_sha1_update(&ctx, pipeline->vs->shader_sha1, + sizeof(pipeline->vs->shader_sha1)); + _mesa_sha1_update(&ctx, pipeline->fs->shader_sha1, + sizeof(pipeline->fs->shader_sha1)); + } + _mesa_sha1_update(&ctx, key, key_size); + + _mesa_sha1_final(&ctx, sha1_out); +} + +/* + * Creates a new shader_variant_create. 
Note that for prog_data is const, so + * it is used only to copy to their own prog_data + * + * Creation includes allocating a shader source bo, and filling it up. + */ +struct v3dv_shader_variant * +v3dv_shader_variant_create(struct v3dv_device *device, + gl_shader_stage stage, + bool is_coord, + const unsigned char *variant_sha1, + struct v3d_prog_data *prog_data, + uint32_t prog_data_size, + const uint64_t *qpu_insts, + uint32_t qpu_insts_size, + VkResult *out_vk_result) +{ + struct v3dv_shader_variant *variant = + vk_zalloc(&device->alloc, sizeof(*variant), 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (variant == NULL) { + *out_vk_result = VK_ERROR_OUT_OF_HOST_MEMORY; + return NULL; + } + + variant->ref_cnt = 1; + variant->stage = stage; + variant->is_coord = is_coord; + memcpy(variant->variant_sha1, variant_sha1, sizeof(variant->variant_sha1)); + variant->prog_data_size = prog_data_size; + variant->prog_data.base = prog_data; + + if (qpu_insts) { + if (!upload_assembly(device, variant, stage, is_coord, + qpu_insts, qpu_insts_size)) { + ralloc_free(variant->prog_data.base); + vk_free(&device->alloc, variant); + + *out_vk_result = VK_ERROR_OUT_OF_DEVICE_MEMORY; + return NULL; + } + variant->qpu_insts_size = qpu_insts_size; + } *out_vk_result = VK_SUCCESS; + + return variant; +} + +/* For a given key, it returns the compiled version of the shader. If it was + * already compiled, it gets it from the p_stage cache, if not it compiles is + * through the v3d compiler + * + * If the method returns NULL it means that it was not able to allocate the + * resources for the variant. out_vk_result would return which OOM applies. + * + * Returns a new reference of the shader_variant to the caller. 
+ */ +struct v3dv_shader_variant* +v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage, + struct v3dv_pipeline_cache *cache, + struct v3d_key *key, + size_t key_size, + const VkAllocationCallbacks *pAllocator, + VkResult *out_vk_result) +{ + /* We first try to get the variant from the internal p_stage cache + * variant + */ + struct hash_table *ht = p_stage->cache; + struct hash_entry *entry = _mesa_hash_table_search(ht, key); + + if (entry) { + *out_vk_result = VK_SUCCESS; + v3dv_shader_variant_ref(entry->data); + return entry->data; + } + + /* Now we search on the pipeline cache if available */ + struct v3dv_pipeline *pipeline = p_stage->pipeline; + unsigned char variant_sha1[20]; + pipeline_hash_variant(p_stage, key, key_size, variant_sha1); + + struct v3dv_shader_variant *variant = + v3dv_pipeline_cache_search_for_variant(pipeline, + cache, + variant_sha1); + + if (variant) { + pipeline_add_variant_to_cache(p_stage, key, key_size, variant); + *out_vk_result = VK_SUCCESS; + return variant; + } + + /* If we don't find the variant in any cache, we compile one and add the + * variant to the cache + */ + struct v3dv_device *device = pipeline->device; + struct v3dv_physical_device *physical_device = + &pipeline->device->instance->physicalDevice; + const struct v3d_compiler *compiler = physical_device->compiler; + + uint32_t variant_id = p_atomic_inc_return(&p_stage->compiled_variant_count); + + if (V3D_DEBUG & (V3D_DEBUG_NIR | + v3d_debug_flag_for_shader_stage(p_stage->stage))) { + fprintf(stderr, "Just before v3d_compile: %s prog %d variant %d NIR:\n", + gl_shader_stage_name(p_stage->stage), + p_stage->program_id, + variant_id); + nir_print_shader(p_stage->nir, stderr); + fprintf(stderr, "\n"); + } + + uint64_t *qpu_insts; + uint32_t qpu_insts_size; + struct v3d_prog_data *prog_data; + + qpu_insts = v3d_compile(compiler, + key, &prog_data, + p_stage->nir, + shader_debug_output, NULL, + p_stage->program_id, + variant_id, + &qpu_insts_size); + + if 
(!qpu_insts) { + fprintf(stderr, "Failed to compile %s prog %d NIR to VIR\n", + gl_shader_stage_name(p_stage->stage), + p_stage->program_id); + } + + variant = v3dv_shader_variant_create(device, p_stage->stage, p_stage->is_coord, + variant_sha1, + prog_data, v3d_prog_data_size(p_stage->stage), + qpu_insts, qpu_insts_size, + out_vk_result); + if (qpu_insts) + free(qpu_insts); + + if (*out_vk_result == VK_SUCCESS) { + pipeline_add_variant_to_cache(p_stage, key, key_size, variant); + v3dv_pipeline_cache_upload_variant(pipeline, cache, variant); + } + return variant; } @@ -1731,6 +1848,12 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, next_stage = stages[stage]; } + /* Assign p_stage to the pipeline. We need to do this before start to + * compile because p_stage sha1 is computed with all the stages + */ + pipeline->vs = stages[MESA_SHADER_VERTEX]; + pipeline->fs = stages[MESA_SHADER_FRAGMENT]; + /* Compiling to vir. Note that at this point we are compiling a default * variant. 
Binding to textures, and other stuff (that would need a * cmd_buffer) would need a recompile @@ -1757,7 +1880,6 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, */ lower_vs_io(p_stage->nir); - pipeline->vs = p_stage; pipeline->vs_bin = pipeline_stage_create_vs_bin(pipeline->vs, pAllocator); /* FIXME: likely this to be moved to a gather info method to a full @@ -1776,7 +1898,7 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, pipeline_populate_v3d_vs_key(key, pCreateInfo, pipeline->vs); VkResult vk_result; pipeline->vs->current_variant = - v3dv_get_shader_variant(pipeline->vs, &key->base, sizeof(*key), + v3dv_get_shader_variant(pipeline->vs, cache, &key->base, sizeof(*key), pAllocator, &vk_result); if (vk_result != VK_SUCCESS) return vk_result; @@ -1784,7 +1906,7 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, key = &pipeline->vs_bin->key.vs; pipeline_populate_v3d_vs_key(key, pCreateInfo, pipeline->vs_bin); pipeline->vs_bin->current_variant = - v3dv_get_shader_variant(pipeline->vs_bin, &key->base, sizeof(*key), + v3dv_get_shader_variant(pipeline->vs_bin, cache, &key->base, sizeof(*key), pAllocator, &vk_result); if (vk_result != VK_SUCCESS) return vk_result; @@ -1794,8 +1916,6 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, case MESA_SHADER_FRAGMENT: { struct v3d_fs_key *key = &p_stage->key.fs; - pipeline->fs = p_stage; - pipeline_populate_v3d_fs_key(key, pCreateInfo, p_stage, get_ucp_enable_mask(stages)); @@ -1803,7 +1923,7 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, VkResult vk_result; p_stage->current_variant = - v3dv_get_shader_variant(p_stage, &key->base, sizeof(*key), + v3dv_get_shader_variant(p_stage, cache, &key->base, sizeof(*key), pAllocator, &vk_result); if (vk_result != VK_SUCCESS) return vk_result; @@ -2821,7 +2941,7 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, VkResult result; p_stage->current_variant = - v3dv_get_shader_variant(p_stage, key, sizeof(*key), alloc, &result); 
+ v3dv_get_shader_variant(p_stage, cache, key, sizeof(*key), alloc, &result); return result; } diff --git a/src/broadcom/vulkan/v3dv_pipeline_cache.c b/src/broadcom/vulkan/v3dv_pipeline_cache.c index d0874d1326a..7d290a54ed8 100644 --- a/src/broadcom/vulkan/v3dv_pipeline_cache.c +++ b/src/broadcom/vulkan/v3dv_pipeline_cache.c @@ -56,6 +56,10 @@ cache_dump_stats(struct v3dv_pipeline_cache *cache) fprintf(stderr, " NIR cache entries: %d\n", cache->nir_stats.count); fprintf(stderr, " NIR cache miss count: %d\n", cache->nir_stats.miss); fprintf(stderr, " NIR cache hit count: %d\n", cache->nir_stats.hit); + + fprintf(stderr, " variant cache entries: %d\n", cache->variant_stats.count); + fprintf(stderr, " variant cache miss count: %d\n", cache->variant_stats.miss); + fprintf(stderr, " variant cache hit count: %d\n", cache->variant_stats.hit); } void @@ -186,12 +190,154 @@ pipeline_cache_init(struct v3dv_pipeline_cache *cache, cache->nir_stats.miss = 0; cache->nir_stats.hit = 0; cache->nir_stats.count = 0; + + cache->variant_cache = _mesa_hash_table_create(NULL, sha1_hash_func, + sha1_compare_func); + cache->variant_stats.miss = 0; + cache->variant_stats.hit = 0; + cache->variant_stats.count = 0; } else { cache->nir_cache = NULL; + cache->variant_cache = NULL; } } +struct v3dv_shader_variant* +v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_cache *cache, + unsigned char sha1_key[20]) +{ + if (!cache || !cache->variant_cache) + return NULL; + + if (unlikely(dump_stats)) { + char sha1buf[41]; + _mesa_sha1_format(sha1buf, sha1_key); + + fprintf(stderr, "pipeline cache %p, search variant with key %s\n", cache, sha1buf); + } + + pthread_mutex_lock(&cache->mutex); + + struct hash_entry *entry = + _mesa_hash_table_search(cache->variant_cache, sha1_key); + + if (entry) { + struct v3dv_shader_variant *variant = + (struct v3dv_shader_variant *) entry->data; + + if (unlikely(dump_stats)) { + fprintf(stderr, "\tcache hit: %p\n", variant); + 
cache->variant_stats.hit++; + cache_dump_stats(cache); + } + + if (variant) + v3dv_shader_variant_ref(variant); + + pthread_mutex_unlock(&cache->mutex); + return variant; + } + + if (unlikely(dump_stats)) { + fprintf(stderr, "\tcache miss\n"); + cache->variant_stats.miss++; + cache_dump_stats(cache); + } + + pthread_mutex_unlock(&cache->mutex); + return NULL; +} + +void +v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_cache *cache, + struct v3dv_shader_variant *variant) +{ + if (!cache || !cache->variant_cache) + return; + + pthread_mutex_lock(&cache->mutex); + struct hash_entry *entry = + _mesa_hash_table_search(cache->variant_cache, variant->variant_sha1); + + if (entry) { + pthread_mutex_unlock(&cache->mutex); + return; + } + + v3dv_shader_variant_ref(variant); + _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant); + if (unlikely(dump_stats)) { + char sha1buf[41]; + _mesa_sha1_format(sha1buf, variant->variant_sha1); + + fprintf(stderr, "pipeline cache %p, new variant entry with key %s\n\t%p\n", + cache, sha1buf, variant); + cache->variant_stats.count++; + cache_dump_stats(cache); + } + + pthread_mutex_unlock(&cache->mutex); +} + +static struct v3dv_shader_variant* +shader_variant_create_from_blob(struct v3dv_device *device, + struct blob_reader *blob) +{ + VkResult result; + + gl_shader_stage stage = blob_read_uint32(blob); + bool is_coord = blob_read_uint8(blob); + + const unsigned char *variant_sha1 = blob_read_bytes(blob, 20); + + uint32_t prog_data_size = blob_read_uint32(blob); + /* FIXME: as we include the stage perhaps we can avoid prog_data_size? 
*/ + if (blob->overrun || prog_data_size != v3d_prog_data_size(stage)) + return NULL; + const void *prog_data = blob_read_bytes(blob, prog_data_size); + if (blob->overrun) + return NULL; + + uint32_t ulist_count = blob_read_uint32(blob); + uint32_t contents_size = sizeof(enum quniform_contents) * ulist_count; + const void *contents_data = blob_read_bytes(blob, contents_size); + if (blob->overrun) + return NULL; + + uint32_t ulist_data_size = sizeof(uint32_t) * ulist_count; + const void *ulist_data_data = blob_read_bytes(blob, ulist_data_size); + if (blob->overrun) + return NULL; + + uint32_t qpu_insts_size = blob_read_uint32(blob); + const uint64_t *qpu_insts = blob_read_bytes(blob, qpu_insts_size); + if (blob->overrun) + return NULL; + + /* shader_variant_create expects a newly created prog_data for their own, + * as it is what the v3d compiler returns. So we are also allocating one + * (including the uniform list) and filled it up with the data that we read + * from the blob + */ + struct v3d_prog_data *new_prog_data = rzalloc_size(NULL, prog_data_size); + memcpy(new_prog_data, prog_data, prog_data_size); + struct v3d_uniform_list *ulist = &new_prog_data->uniforms; + ulist->count = ulist_count; + ulist->contents = ralloc_array(new_prog_data, enum quniform_contents, ulist->count); + memcpy(ulist->contents, contents_data, contents_size); + ulist->data = ralloc_array(new_prog_data, uint32_t, ulist->count); + memcpy(ulist->data, ulist_data_data, ulist_data_size); + + return v3dv_shader_variant_create(device, stage, is_coord, + variant_sha1, + new_prog_data, prog_data_size, + qpu_insts, qpu_insts_size, + &result); +} + static void pipeline_cache_load(struct v3dv_pipeline_cache *cache, size_t size, @@ -201,6 +347,21 @@ pipeline_cache_load(struct v3dv_pipeline_cache *cache, struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; struct vk_pipeline_cache_header header; + if (cache->variant_cache == NULL) + return; + + struct blob_reader blob; + blob_reader_init(&blob, data, size); + 
+ blob_copy_bytes(&blob, &header, sizeof(header)); + uint32_t count = blob_read_uint32(&blob); + if (blob.overrun) + return; + + if (unlikely(dump_stats)) { + fprintf(stderr, "pipeline cache %p, loading %i variant entries\n", cache, count); + } + if (size < sizeof(header)) return; memcpy(&header, data, sizeof(header)); @@ -215,9 +376,16 @@ pipeline_cache_load(struct v3dv_pipeline_cache *cache, if (memcmp(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE) != 0) return; - /* FIXME: at this point we only verify the header but we dont really load - * any data. pending to implement serialize/deserialize among other things. - */ + for (uint32_t i = 0; i < count; i++) { + struct v3dv_shader_variant *variant = + shader_variant_create_from_blob(device, &blob); + if (!variant) + break; + _mesa_hash_table_insert(cache->variant_cache, variant->variant_sha1, variant); + if (unlikely(dump_stats)) + cache->variant_stats.count++; + } + } VkResult @@ -271,6 +439,15 @@ v3dv_DestroyPipelineCache(VkDevice _device, ralloc_free(entry->data); _mesa_hash_table_destroy(cache->nir_cache, NULL); + + hash_table_foreach(cache->variant_cache, entry) { + struct v3dv_shader_variant *variant = entry->data; + if (variant) + v3dv_shader_variant_unref(device, variant); + } + + _mesa_hash_table_destroy(cache->variant_cache, NULL); + } vk_free2(&device->alloc, pAllocator, cache); @@ -288,6 +465,30 @@ v3dv_MergePipelineCaches(VkDevice device, return VK_SUCCESS; } +static bool +shader_variant_write_to_blob(const struct v3dv_shader_variant *variant, + struct blob *blob) +{ + blob_write_uint32(blob, variant->stage); + blob_write_uint8(blob, variant->is_coord); + + blob_write_bytes(blob, variant->variant_sha1, sizeof(variant->variant_sha1)); + + blob_write_uint32(blob, variant->prog_data_size); + blob_write_bytes(blob, variant->prog_data.base, variant->prog_data_size); + + struct v3d_uniform_list *ulist = &variant->prog_data.base->uniforms; + blob_write_uint32(blob, ulist->count); + 
blob_write_bytes(blob, ulist->contents, sizeof(enum quniform_contents) * ulist->count); + blob_write_bytes(blob, ulist->data, sizeof(uint32_t) * ulist->count); + + blob_write_uint32(blob, variant->qpu_insts_size); + assert(variant->assembly_bo->map); + blob_write_bytes(blob, variant->assembly_bo->map, variant->qpu_insts_size); + + return !blob->out_of_memory; +} + VkResult v3dv_GetPipelineCacheData(VkDevice _device, VkPipelineCache _cache, @@ -296,32 +497,68 @@ v3dv_GetPipelineCacheData(VkDevice _device, { V3DV_FROM_HANDLE(v3dv_device, device, _device); V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache); + + struct blob blob; + if (pData) { + blob_init_fixed(&blob, pData, *pDataSize); + } else { + blob_init_fixed(&blob, NULL, SIZE_MAX); + } + struct v3dv_physical_device *pdevice = &device->instance->physicalDevice; - struct vk_pipeline_cache_header *header; VkResult result = VK_SUCCESS; pthread_mutex_lock(&cache->mutex); - /* FIXME: at this point the cache data is just the header */ - const size_t size = sizeof(*header); - if (pData == NULL) { - pthread_mutex_unlock(&cache->mutex); - *pDataSize = size; - return VK_SUCCESS; - } - if (*pDataSize < sizeof(*header)) { - pthread_mutex_unlock(&cache->mutex); + struct vk_pipeline_cache_header header = { + .header_size = sizeof(struct vk_pipeline_cache_header), + .header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE, + .vendor_id = v3dv_physical_device_vendor_id(pdevice), + .device_id = v3dv_physical_device_device_id(pdevice), + }; + memcpy(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE); + blob_write_bytes(&blob, &header, sizeof(header)); + + uint32_t count = 0; + intptr_t count_offset = blob_reserve_uint32(&blob); + if (count_offset < 0) { *pDataSize = 0; + blob_finish(&blob); + pthread_mutex_unlock(&cache->mutex); return VK_INCOMPLETE; } - header = pData; - header->header_size = sizeof(*header); - header->header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE; - header->vendor_id = 
v3dv_physical_device_vendor_id(pdevice); - header->device_id = v3dv_physical_device_device_id(pdevice); - memcpy(header->uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE); + if (cache->variant_cache) { + hash_table_foreach(cache->variant_cache, entry) { + struct v3dv_shader_variant *variant = entry->data; + + size_t save_size = blob.size; + if (!shader_variant_write_to_blob(variant, &blob)) { + /* If it fails reset to the previous size and bail. The mutex is + * released once, after the loop, so it must not be unlocked here. */ + blob.size = save_size; + result = VK_INCOMPLETE; + break; + } + + count++; + } + } + + blob_overwrite_uint32(&blob, count_offset, count); + + *pDataSize = blob.size; + + blob_finish(&blob); + + if (unlikely(dump_stats)) { + assert(count <= cache->variant_stats.count); + fprintf(stderr, "GetPipelineCacheData: serializing cache %p, " + "%i variant entries, %u DataSize\n", + cache, count, (uint32_t) *pDataSize); + } pthread_mutex_unlock(&cache->mutex); + return result; } diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index a58063ed4ad..115840ace50 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -278,6 +278,9 @@ struct v3dv_pipeline_cache { struct hash_table *nir_cache; struct v3dv_pipeline_cache_stats nir_stats; + + struct hash_table *variant_cache; + struct v3dv_pipeline_cache_stats variant_stats; }; struct v3dv_device { @@ -1221,6 +1224,14 @@ vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage) struct v3dv_shader_variant { uint32_t ref_cnt; + gl_shader_stage stage; + bool is_coord; + + /* key for the pipeline cache: sha1 of the stage(s) shader_sha1 plus the + * v3d_key used to compile the variant + */ + unsigned char variant_sha1[20]; + union { struct v3d_prog_data *base; struct v3d_vs_prog_data *vs; @@ -1228,11 +1239,16 @@ struct v3dv_shader_variant { struct v3d_compute_prog_data *cs; } prog_data; + /* We explicitly save the prog_data_size as it would make easier to + * serialize + */ + uint32_t prog_data_size; /* FIXME: using 
one bo per shader. Eventually we would be interested on * reusing the same bo for all the shaders, like a bo per v3dv_pipeline for * shaders. */ struct v3dv_bo *assembly_bo; + uint32_t qpu_insts_size; }; /* @@ -1278,11 +1294,13 @@ struct v3dv_pipeline_stage { struct v3d_fs_key fs; } key; - /* Cache with all the shader variant. + /* Cache with all the shader variants built for this pipeline. This one is + * required over the pipeline cache because we still allow to create shader + * variants after Pipeline creation. */ struct hash_table *cache; struct v3dv_shader_variant *current_variant; /* FIXME: only make sense on vs, so perhaps a v3dv key like radv? or a kind * of pipe_draw_info @@ -1712,11 +1730,23 @@ struct v3dv_cl_reloc v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_ struct v3dv_shader_variant * v3dv_get_shader_variant(struct v3dv_pipeline_stage *p_stage, + struct v3dv_pipeline_cache *cache, struct v3d_key *key, size_t key_size, const VkAllocationCallbacks *pAllocator, VkResult *out_vk_result); +struct v3dv_shader_variant * +v3dv_shader_variant_create(struct v3dv_device *device, + gl_shader_stage stage, + bool is_coord, + const unsigned char *variant_sha1, + struct v3d_prog_data *prog_data, + uint32_t prog_data_size, + const uint64_t *qpu_insts, + uint32_t qpu_insts_size, + VkResult *out_vk_result); + void v3dv_shader_variant_destroy(struct v3dv_device *device, struct v3dv_shader_variant *variant); @@ -1786,6 +1816,16 @@ nir_shader* v3dv_pipeline_cache_search_for_nir(struct v3dv_pipeline *pipeline, const nir_shader_compiler_options *nir_options, unsigned char sha1_key[20]); +struct v3dv_shader_variant* +v3dv_pipeline_cache_search_for_variant(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_cache *cache, + unsigned char sha1_key[20]); + +void +v3dv_pipeline_cache_upload_variant(struct v3dv_pipeline *pipeline, + struct v3dv_pipeline_cache *cache, + struct v3dv_shader_variant *variant); + #define 
V3DV_DEFINE_HANDLE_CASTS(__v3dv_type, __VkType) \ \