v3dv: add a CPU path for buffer to image copies

The blit shader path for buffer to image copies is pretty bad,
since it needs to produce a tiled image from the linear buffer
prior to emitting the blit copy.

This patch adds a new preferential path where we implement the
copy using the CPU, similar to what the GL driver does for
texture uploads. This makes vkQuake2 at least 4x faster when
dynamic lights are enabled (which triggers dynamic texture
updates).

We also tested a GPU path where we use a shader that takes the
linear buffer as a UBO and copies directly from it. This also
shows a clear performance gain, but still worse than the CPU
implementation.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
This commit is contained in:
Iago Toral Quiroga
2020-06-15 16:44:49 +02:00
committed by Marge Bot
parent 1e57995609
commit 1f8343b875
3 changed files with 138 additions and 6 deletions
+68
View File
@@ -2537,6 +2537,72 @@ copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
return true;
}
/**
 * Records a CPU job in the command buffer that copies one region of a buffer
 * into an image.
 *
 * The copy itself is not performed here: the region parameters are stored in
 * a V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE job that is appended to the
 * command buffer's job list.
 *
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_buffer_to_image_cpu(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_image *image,
                         struct v3dv_buffer *buffer,
                         const VkBufferImageCopy *region)
{
   /* FIXME: depth/stencil formats, compressed formats and linear images are
    * rejected by this path, so the caller has to fall back to another one.
    */
   if (vk_format_is_depth_or_stencil(image->vk_format))
      return false;

   if (vk_format_is_compressed(image->vk_format))
      return false;

   if (image->tiling == VK_IMAGE_TILING_LINEAR)
      return false;

   /* A bufferRowLength / bufferImageHeight of zero means the buffer data is
    * tightly packed according to the image extent.
    */
   uint32_t buffer_width, buffer_height;
   if (region->bufferRowLength == 0)
      buffer_width = region->imageExtent.width;
   else
      buffer_width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      buffer_height = region->imageExtent.height;
   else
      buffer_height = region->bufferImageHeight;

   uint32_t buffer_stride = buffer_width * image->cpp;
   uint32_t buffer_layer_stride = buffer_stride * buffer_height;

   /* For 3D images the copy walks depth slices rather than array layers:
    * the first slice is imageOffset.z and the slice count is
    * imageExtent.depth. Using baseArrayLayer here would always start at
    * slice 0 and ignore imageOffset.z.
    */
   uint32_t base_layer, num_layers;
   if (image->type != VK_IMAGE_TYPE_3D) {
      base_layer = region->imageSubresource.baseArrayLayer;
      num_layers = region->imageSubresource.layerCount;
   } else {
      assert(region->imageOffset.z >= 0);
      base_layer = (uint32_t) region->imageOffset.z;
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                     V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
                                     cmd_buffer, -1);
   /* The operation is supported even if the job could not be created, so
    * report it as handled instead of letting the caller try another path.
    */
   if (!job)
      return true;

   job->cpu.copy_buffer_to_image.image = image;
   job->cpu.copy_buffer_to_image.buffer = buffer;
   job->cpu.copy_buffer_to_image.buffer_stride = buffer_stride;
   job->cpu.copy_buffer_to_image.buffer_layer_stride = buffer_layer_stride;
   job->cpu.copy_buffer_to_image.buffer_offset = region->bufferOffset;
   job->cpu.copy_buffer_to_image.image_extent = region->imageExtent;
   job->cpu.copy_buffer_to_image.image_offset = region->imageOffset;
   job->cpu.copy_buffer_to_image.mip_level =
      region->imageSubresource.mipLevel;
   job->cpu.copy_buffer_to_image.base_layer = base_layer;
   job->cpu.copy_buffer_to_image.layer_count = num_layers;

   list_addtail(&job->list_link, &cmd_buffer->jobs);

   return true;
}
void
v3dv_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
VkBuffer srcBuffer,
@@ -2554,6 +2620,8 @@ v3dv_CmdCopyBufferToImage(VkCommandBuffer commandBuffer,
continue;
if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &pRegions[i]))
continue;
if (copy_buffer_to_image_cpu(cmd_buffer, image, buffer, &pRegions[i]))
continue;
if (copy_buffer_to_image_blit(cmd_buffer, image, buffer, &pRegions[i]))
continue;
unreachable("Unsupported buffer to image copy.");
+21 -6
View File
@@ -656,6 +656,7 @@ enum v3dv_job_type {
V3DV_JOB_TYPE_CPU_SET_EVENT,
V3DV_JOB_TYPE_CPU_WAIT_EVENTS,
V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS,
V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE,
};
struct v3dv_reset_query_cpu_job_info {
@@ -700,6 +701,19 @@ struct v3dv_clear_attachments_cpu_job_info {
VkClearRect *rects;
};
/* Parameters of a CPU job that copies a region of a buffer into an image
 * (V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE).
 */
struct v3dv_copy_buffer_to_image_cpu_job_info {
   /* Destination image and source buffer of the copy */
   struct v3dv_image *image;
   struct v3dv_buffer *buffer;
   /* Offset of the source data inside the buffer.
    * NOTE(review): this is assigned from VkBufferImageCopy::bufferOffset,
    * which is a VkDeviceSize; confirm 32 bits is always enough here.
    */
   uint32_t buffer_offset;
   /* Size of one row of buffer data (row length in texels * image cpp) */
   uint32_t buffer_stride;
   /* Size of one layer of buffer data (buffer_stride * rows per layer) */
   uint32_t buffer_layer_stride;
   /* Offset and extent of the destination region inside the image */
   VkOffset3D image_offset;
   VkExtent3D image_extent;
   /* Mip level of the image that is written */
   uint32_t mip_level;
   /* Layers base_layer .. base_layer + layer_count - 1 are written; for 3D
    * images layer_count is the depth of the copied region.
    */
   uint32_t base_layer;
   uint32_t layer_count;
};
struct v3dv_job {
struct list_head list_link;
@@ -757,12 +771,13 @@ struct v3dv_job {
/* Job specs for CPU jobs */
union {
struct v3dv_reset_query_cpu_job_info query_reset;
struct v3dv_end_query_cpu_job_info query_end;
struct v3dv_copy_query_results_cpu_job_info query_copy_results;
struct v3dv_event_set_cpu_job_info event_set;
struct v3dv_event_wait_cpu_job_info event_wait;
struct v3dv_clear_attachments_cpu_job_info clear_attachments;
struct v3dv_reset_query_cpu_job_info query_reset;
struct v3dv_end_query_cpu_job_info query_end;
struct v3dv_copy_query_results_cpu_job_info query_copy_results;
struct v3dv_event_set_cpu_job_info event_set;
struct v3dv_event_wait_cpu_job_info event_wait;
struct v3dv_clear_attachments_cpu_job_info clear_attachments;
struct v3dv_copy_buffer_to_image_cpu_job_info copy_buffer_to_image;
} cpu;
/* Job specs for TFU jobs */
+49
View File
@@ -389,6 +389,53 @@ handle_wait_events_cpu_job(struct v3dv_job *job,
return VK_NOT_READY;
}
static VkResult
handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job)
{
assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE);
struct v3dv_copy_buffer_to_image_cpu_job_info *info =
&job->cpu.copy_buffer_to_image;
/* Wait for all GPU work to finish first, since we may be accessing
* the BOs involved in the operation.
*/
v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));
/* Map BOs */
struct v3dv_bo *dst_bo = info->image->mem->bo;
if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size))
return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
void *dst_ptr = dst_bo->map;
struct v3dv_bo *src_bo = info->buffer->mem->bo;
if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size))
return vk_error(job->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
void *src_ptr = src_bo->map;
const struct v3d_resource_slice *slice =
&info->image->slices[info->mip_level];
const struct pipe_box box = {
info->image_offset.x, info->image_offset.y, info->base_layer,
info->image_extent.width, info->image_extent.height, info->layer_count,
};
/* Copy each layer */
for (uint32_t i = 0; i < info->layer_count; i++) {
const uint32_t dst_offset =
v3dv_layer_offset(info->image, info->mip_level, info->base_layer + i);
const uint32_t src_offset =
info->buffer->mem_offset + info->buffer_offset +
info->buffer_layer_stride * i;
v3d_store_tiled_image(
dst_ptr + dst_offset, slice->stride,
src_ptr + src_offset, info->buffer_stride,
slice->tiling, info->image->cpp, slice->padded_height, &box);
}
return VK_SUCCESS;
}
static VkResult
process_semaphores_to_signal(struct v3dv_device *device,
uint32_t count, const VkSemaphore *sems)
@@ -569,6 +616,8 @@ queue_submit_job(struct v3dv_queue *queue,
return handle_set_event_cpu_job(job, wait_thread != NULL);
case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
return handle_wait_events_cpu_job(job, do_wait, wait_thread);
case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
return handle_copy_buffer_to_image_cpu_job(job);
default:
unreachable("Unhandled job type");
}