tu: Optimize hash_renderpass_instance by removing XXH64_update
Testing showed that `XXH64_update` is significantly slower than calling `XXH64` directly as far as small data velocity is concerned. This function is called at the end of every render pass, which made it visible while profiling; with this change the difference is substantial (measured to be ~4x), and the function no longer shows up in profiles at all. Signed-off-by: Mark Collins <mark@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18428>
This commit is contained in:
@@ -144,40 +144,26 @@ free_submission_data(struct tu_submission_data *data)
|
||||
free(data);
|
||||
}
|
||||
|
||||
/* Feeds the raw bytes of `field` (sizeof(field) bytes starting at &field)
 * into the streaming XXH64 state `state` via XXH64_update().
 * NOTE(review): per the commit title this macro belongs to the removed
 * (old) side of the diff shown here. */
#define APPEND_TO_HASH(state, field) \
|
||||
XXH64_update(state, &field, sizeof(field));
|
||||
|
||||
/* Computes a 64-bit hash identifying one instance of a render pass.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers, so two
 * implementations are interleaved below:
 *   - streaming version: XXH64_reset() / APPEND_TO_HASH() / XXH64_update() /
 *     XXH64_digest() on `hash_state`; per the commit title these are the
 *     lines being removed.
 *   - buffered version: packs values into the local `data` array through
 *     `ptr` and hashes it with a single XXH64() call.
 *
 * Buffered version inputs: framebuffer width/height/layers (3 words) plus,
 * for each of pass->attachment_count attachments, the view width/height and
 * the image format/array_layers/mip_levels (5 words each) -- which is where
 * the `3 + attachment_count * 5` array size comes from. pass->autotune_hash
 * is passed as the third (seed) argument of XXH64().
 *
 * Returns the resulting XXH64 value.
 */
static uint64_t
|
||||
hash_renderpass_instance(const struct tu_render_pass *pass,
|
||||
const struct tu_framebuffer *framebuffer,
|
||||
const struct tu_cmd_buffer *cmd) {
|
||||
XXH64_state_t hash_state;
|
||||
XXH64_reset(&hash_state, 0);
|
||||
/* Variable-length scratch buffer: 3 framebuffer words + 5 words per attachment. */
uint32_t data[3 + pass->attachment_count * 5];
|
||||
uint32_t* ptr = data;
|
||||
|
||||
APPEND_TO_HASH(&hash_state, framebuffer->width);
|
||||
APPEND_TO_HASH(&hash_state, framebuffer->height);
|
||||
APPEND_TO_HASH(&hash_state, framebuffer->layers);
|
||||
|
||||
APPEND_TO_HASH(&hash_state, pass->attachment_count);
|
||||
XXH64_update(&hash_state, pass->attachments, pass->attachment_count * sizeof(pass->attachments[0]));
|
||||
*ptr++ = framebuffer->width;
|
||||
*ptr++ = framebuffer->height;
|
||||
*ptr++ = framebuffer->layers;
|
||||
|
||||
for (unsigned i = 0; i < pass->attachment_count; i++) {
|
||||
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.width);
|
||||
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->view.height);
|
||||
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.format);
|
||||
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.array_layers);
|
||||
APPEND_TO_HASH(&hash_state, cmd->state.attachments[i]->image->vk.mip_levels);
|
||||
*ptr++ = cmd->state.attachments[i]->view.width;
|
||||
*ptr++ = cmd->state.attachments[i]->view.height;
|
||||
*ptr++ = cmd->state.attachments[i]->image->vk.format;
|
||||
*ptr++ = cmd->state.attachments[i]->image->vk.array_layers;
|
||||
*ptr++ = cmd->state.attachments[i]->image->vk.mip_levels;
|
||||
}
|
||||
|
||||
APPEND_TO_HASH(&hash_state, pass->subpass_count);
|
||||
for (unsigned i = 0; i < pass->subpass_count; i++) {
|
||||
APPEND_TO_HASH(&hash_state, pass->subpasses[i].samples);
|
||||
APPEND_TO_HASH(&hash_state, pass->subpasses[i].input_count);
|
||||
APPEND_TO_HASH(&hash_state, pass->subpasses[i].color_count);
|
||||
APPEND_TO_HASH(&hash_state, pass->subpasses[i].resolve_count);
|
||||
}
|
||||
|
||||
/* Streaming-version result (old side of the diff). */
return XXH64_digest(&hash_state);
|
||||
/* Buffered-version result: one-shot hash of `data`, seeded with pass->autotune_hash. */
return XXH64(data, sizeof(data), pass->autotune_hash);
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
@@ -510,6 +510,27 @@ static void update_samples(struct tu_subpass *subpass,
|
||||
subpass->samples = samples;
|
||||
}
|
||||
|
||||
/* Computes a hash of the render pass description and stores it in
 * pass->autotune_hash.
 *
 * The hash is built by chaining one-shot XXH64() calls, each taking the
 * previous result as its third (seed) argument, over, in order:
 *   - pass->attachment_count
 *   - the raw bytes of the pass->attachments array
 *   - pass->subpass_count
 *   - for every subpass: samples, input_count, color_count, resolve_count
 *
 * NOTE(review): the attachments array is hashed as raw memory, so any padding
 * bytes inside its elements take part in the hash -- confirm the array is
 * zero-initialized where it is allocated.
 */
static void
|
||||
tu_render_pass_calc_hash(struct tu_render_pass *pass)
|
||||
{
|
||||
/* Function-local helper: hashes the bytes of `data` using `hash` as the seed. */
#define HASH(hash, data) XXH64(&(data), sizeof(data), hash)
|
||||
|
||||
uint64_t hash = HASH(0, pass->attachment_count);
|
||||
hash = XXH64(pass->attachments,
|
||||
pass->attachment_count * sizeof(pass->attachments[0]), hash);
|
||||
hash = HASH(hash, pass->subpass_count);
|
||||
for (unsigned i = 0; i < pass->subpass_count; i++) {
|
||||
hash = HASH(hash, pass->subpasses[i].samples);
|
||||
hash = HASH(hash, pass->subpasses[i].input_count);
|
||||
hash = HASH(hash, pass->subpasses[i].color_count);
|
||||
hash = HASH(hash, pass->subpasses[i].resolve_count);
|
||||
}
|
||||
|
||||
pass->autotune_hash = hash;
|
||||
|
||||
/* Keep the helper macro from leaking past this function. */
#undef HASH
|
||||
}
|
||||
|
||||
static void
|
||||
tu_render_pass_cond_config(struct tu_render_pass *pass)
|
||||
{
|
||||
@@ -926,13 +947,14 @@ tu_CreateRenderPass2(VkDevice _device,
|
||||
tu_render_pass_cond_config(pass);
|
||||
tu_render_pass_gmem_config(pass, device->physical_device);
|
||||
tu_render_pass_bandwidth_config(pass);
|
||||
tu_render_pass_calc_hash(pass);
|
||||
|
||||
for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) {
|
||||
tu_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]);
|
||||
}
|
||||
|
||||
tu_render_pass_add_implicit_deps(pass, pCreateInfo);
|
||||
|
||||
|
||||
*pRenderPass = tu_render_pass_to_handle(pass);
|
||||
|
||||
return VK_SUCCESS;
|
||||
@@ -1092,6 +1114,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
|
||||
tu_render_pass_cond_config(pass);
|
||||
tu_render_pass_gmem_config(pass, device->physical_device);
|
||||
tu_render_pass_bandwidth_config(pass);
|
||||
tu_render_pass_calc_hash(pass);
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -100,6 +100,7 @@ struct tu_render_pass
|
||||
uint32_t subpass_count;
|
||||
uint32_t gmem_pixels[TU_GMEM_LAYOUT_COUNT];
|
||||
uint32_t tile_align_w;
|
||||
uint64_t autotune_hash;
|
||||
|
||||
/* memory bandwidth costs (in bytes) for gmem / sysmem rendering */
|
||||
uint32_t gmem_bandwidth_per_pixel;
|
||||
|
||||
Reference in New Issue
Block a user