freedreno/drm/sp: Implement deferred submit merging
For submits flushed with (a) no required fence, and (b) no externally visible effects (i.e. no imported/exported bo), we can defer flushing the submit and merge it into a later submit. This is a bit more work in userspace, but it cuts down the number of submit ioctls. A common case is that later submits overlap in the BOs they use (for example, a blit upload to a buffer, which is then used in the following draw pass), so merging also reduces the net amount of work the kernel needs to do to handle the submit ioctl.
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/19
Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10444>
This commit is contained in:
@@ -177,6 +177,7 @@ msm_pipe_destroy(struct fd_pipe *pipe)
|
||||
static const struct fd_pipe_funcs sp_funcs = {
|
||||
.ringbuffer_new_object = msm_ringbuffer_sp_new_object,
|
||||
.submit_new = msm_submit_sp_new,
|
||||
.flush = msm_pipe_sp_flush,
|
||||
.get_param = msm_pipe_get_param,
|
||||
.wait = msm_pipe_wait,
|
||||
.destroy = msm_pipe_destroy,
|
||||
|
||||
@@ -67,6 +67,7 @@ struct fd_ringbuffer *msm_ringbuffer_sp_new_object(struct fd_pipe *pipe,
|
||||
|
||||
struct fd_submit *msm_submit_new(struct fd_pipe *pipe);
|
||||
struct fd_submit *msm_submit_sp_new(struct fd_pipe *pipe);
|
||||
void msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence);
|
||||
|
||||
void msm_pipe_sp_ringpool_init(struct msm_pipe *msm_pipe);
|
||||
void msm_pipe_sp_ringpool_fini(struct msm_pipe *msm_pipe);
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "util/hash_table.h"
|
||||
#include "util/os_file.h"
|
||||
#include "util/slab.h"
|
||||
|
||||
#include "drm/freedreno_ringbuffer.h"
|
||||
@@ -57,6 +58,12 @@ struct msm_submit_sp {
|
||||
* so we can reclaim extra space at its end.
|
||||
*/
|
||||
struct fd_ringbuffer *suballoc_ring;
|
||||
|
||||
/* Flush args, potentially attached to the last submit in the list
|
||||
* of submits to merge:
|
||||
*/
|
||||
int in_fence_fd;
|
||||
struct fd_submit_fence *out_fence;
|
||||
};
|
||||
FD_DEFINE_CAST(fd_submit, msm_submit_sp);
|
||||
|
||||
@@ -108,7 +115,7 @@ msm_submit_append_bo(struct msm_submit_sp *submit, struct fd_bo *bo)
|
||||
|
||||
/* NOTE: it is legal to use the same bo on different threads for
|
||||
* different submits. But it is not legal to use the same submit
|
||||
* from given threads.
|
||||
* from different threads.
|
||||
*/
|
||||
idx = READ_ONCE(msm_bo->idx);
|
||||
|
||||
@@ -213,10 +220,12 @@ msm_submit_sp_new_ringbuffer(struct fd_submit *submit, uint32_t size,
|
||||
* 2) Add cmdstream bos to bos table
|
||||
* 3) Update bo fences
|
||||
*/
|
||||
static void
|
||||
msm_submit_sp_flush_prep(struct fd_submit *submit)
|
||||
static bool
|
||||
msm_submit_sp_flush_prep(struct fd_submit *submit, int in_fence_fd,
|
||||
struct fd_submit_fence *out_fence)
|
||||
{
|
||||
struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
|
||||
bool has_shared = false;
|
||||
|
||||
finalize_current_cmd(submit->primary);
|
||||
|
||||
@@ -227,43 +236,93 @@ msm_submit_sp_flush_prep(struct fd_submit *submit)
|
||||
msm_submit_append_bo(msm_submit, primary->u.cmds[i].ring_bo);
|
||||
|
||||
simple_mtx_lock(&table_lock);
|
||||
for (unsigned i = 0; i < msm_submit->nr_bos; i++)
|
||||
for (unsigned i = 0; i < msm_submit->nr_bos; i++) {
|
||||
fd_bo_add_fence(msm_submit->bos[i], submit->pipe, submit->fence);
|
||||
has_shared |= msm_submit->bos[i]->shared;
|
||||
}
|
||||
simple_mtx_unlock(&table_lock);
|
||||
|
||||
msm_submit->out_fence = out_fence;
|
||||
msm_submit->in_fence_fd = (in_fence_fd == -1) ?
|
||||
-1 : os_dupfd_cloexec(in_fence_fd);
|
||||
|
||||
return has_shared;
|
||||
}
|
||||
|
||||
static int
|
||||
msm_submit_sp_flush_finish(struct fd_submit *submit, int in_fence_fd,
|
||||
struct fd_submit_fence *out_fence)
|
||||
flush_submit_list(struct list_head *submit_list)
|
||||
{
|
||||
struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
|
||||
struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe);
|
||||
struct msm_submit_sp *msm_submit = to_msm_submit_sp(last_submit(submit_list));
|
||||
struct msm_pipe *msm_pipe = to_msm_pipe(msm_submit->base.pipe);
|
||||
struct drm_msm_gem_submit req = {
|
||||
.flags = msm_pipe->pipe,
|
||||
.queueid = msm_pipe->queue_id,
|
||||
};
|
||||
int ret;
|
||||
|
||||
struct msm_ringbuffer_sp *primary =
|
||||
to_msm_ringbuffer_sp(submit->primary);
|
||||
struct drm_msm_gem_submit_cmd cmds[primary->u.nr_cmds];
|
||||
unsigned nr_cmds = 0;
|
||||
|
||||
for (unsigned i = 0; i < primary->u.nr_cmds; i++) {
|
||||
cmds[i].type = MSM_SUBMIT_CMD_BUF;
|
||||
cmds[i].submit_idx =
|
||||
msm_submit_append_bo(msm_submit, primary->u.cmds[i].ring_bo);
|
||||
cmds[i].submit_offset = primary->offset;
|
||||
cmds[i].size = primary->u.cmds[i].size;
|
||||
cmds[i].pad = 0;
|
||||
cmds[i].nr_relocs = 0;
|
||||
/* Determine the number of extra cmds from deferred submits that
|
||||
* we will be merging in:
|
||||
*/
|
||||
foreach_submit (submit, submit_list) {
|
||||
assert(submit->pipe == &msm_pipe->base);
|
||||
nr_cmds += to_msm_ringbuffer_sp(submit->primary)->u.nr_cmds;
|
||||
}
|
||||
|
||||
if (in_fence_fd != -1) {
|
||||
struct drm_msm_gem_submit_cmd cmds[nr_cmds];
|
||||
|
||||
unsigned cmd_idx = 0;
|
||||
|
||||
/* Build up the table of cmds, and for all but the last submit in the
|
||||
* list, merge their bo tables into the last submit.
|
||||
*/
|
||||
foreach_submit_safe (submit, submit_list) {
|
||||
struct msm_ringbuffer_sp *deferred_primary =
|
||||
to_msm_ringbuffer_sp(submit->primary);
|
||||
|
||||
for (unsigned i = 0; i < deferred_primary->u.nr_cmds; i++) {
|
||||
cmds[cmd_idx].type = MSM_SUBMIT_CMD_BUF;
|
||||
cmds[cmd_idx].submit_idx =
|
||||
msm_submit_append_bo(msm_submit, deferred_primary->u.cmds[i].ring_bo);
|
||||
cmds[cmd_idx].submit_offset = deferred_primary->offset;
|
||||
cmds[cmd_idx].size = deferred_primary->u.cmds[i].size;
|
||||
cmds[cmd_idx].pad = 0;
|
||||
cmds[cmd_idx].nr_relocs = 0;
|
||||
|
||||
cmd_idx++;
|
||||
}
|
||||
|
||||
/* We are merging all the submits in the list into the last submit,
|
||||
* so the remainder of the loop body doesn't apply to the last submit
|
||||
*/
|
||||
if (submit == last_submit(submit_list)) {
|
||||
DEBUG_MSG("merged %u submits", cmd_idx);
|
||||
break;
|
||||
}
|
||||
|
||||
struct msm_submit_sp *msm_deferred_submit = to_msm_submit_sp(submit);
|
||||
for (unsigned i = 0; i < msm_deferred_submit->nr_bos; i++) {
|
||||
/* Note: if bo is used in both the current submit and the deferred
|
||||
* submit being merged, we expect to hit the fast-path as we add it
|
||||
* to the current submit:
|
||||
*/
|
||||
msm_submit_append_bo(msm_submit, msm_deferred_submit->bos[i]);
|
||||
}
|
||||
|
||||
/* Now that the cmds/bos have been transferred over to the current submit,
|
||||
* we can remove the deferred submit from the list and drop its reference
|
||||
*/
|
||||
list_del(&submit->node);
|
||||
fd_submit_del(submit);
|
||||
}
|
||||
|
||||
if (msm_submit->in_fence_fd != -1) {
|
||||
req.flags |= MSM_SUBMIT_FENCE_FD_IN | MSM_SUBMIT_NO_IMPLICIT;
|
||||
req.fence_fd = in_fence_fd;
|
||||
req.fence_fd = msm_submit->in_fence_fd;
|
||||
}
|
||||
|
||||
if (out_fence && out_fence->use_fence_fd) {
|
||||
if (msm_submit->out_fence && msm_submit->out_fence->use_fence_fd) {
|
||||
req.flags |= MSM_SUBMIT_FENCE_FD_OUT;
|
||||
}
|
||||
|
||||
@@ -289,35 +348,146 @@ msm_submit_sp_flush_finish(struct fd_submit *submit, int in_fence_fd,
|
||||
submit_bos[i].presumed = 0;
|
||||
}
|
||||
|
||||
req.bos = VOID2U64(submit_bos), req.nr_bos = msm_submit->nr_bos;
|
||||
req.cmds = VOID2U64(cmds), req.nr_cmds = primary->u.nr_cmds;
|
||||
req.bos = VOID2U64(submit_bos);
|
||||
req.nr_bos = msm_submit->nr_bos;
|
||||
req.cmds = VOID2U64(cmds);
|
||||
req.nr_cmds = nr_cmds;
|
||||
|
||||
DEBUG_MSG("nr_cmds=%u, nr_bos=%u", req.nr_cmds, req.nr_bos);
|
||||
|
||||
ret = drmCommandWriteRead(submit->pipe->dev->fd, DRM_MSM_GEM_SUBMIT, &req,
|
||||
ret = drmCommandWriteRead(msm_pipe->base.dev->fd, DRM_MSM_GEM_SUBMIT, &req,
|
||||
sizeof(req));
|
||||
if (ret) {
|
||||
ERROR_MSG("submit failed: %d (%s)", ret, strerror(errno));
|
||||
msm_dump_submit(&req);
|
||||
} else if (!ret && out_fence) {
|
||||
out_fence->fence.kfence = req.fence;
|
||||
out_fence->fence.ufence = submit->fence;
|
||||
out_fence->fence_fd = req.fence_fd;
|
||||
} else if (!ret && msm_submit->out_fence) {
|
||||
msm_submit->out_fence->fence.kfence = req.fence;
|
||||
msm_submit->out_fence->fence.ufence = msm_submit->base.fence;
|
||||
msm_submit->out_fence->fence_fd = req.fence_fd;
|
||||
}
|
||||
|
||||
if (!bos_on_stack)
|
||||
free(submit_bos);
|
||||
|
||||
if (msm_submit->in_fence_fd != -1)
|
||||
close(msm_submit->in_fence_fd);
|
||||
|
||||
fd_submit_del(&msm_submit->base);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool
|
||||
should_defer(struct fd_submit *submit)
|
||||
{
|
||||
struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
|
||||
|
||||
/* if too many bo's, it may not be worth the CPU cost of submit merging: */
|
||||
if (msm_submit->nr_bos > 30)
|
||||
return false;
|
||||
|
||||
/* On the kernel side, with 32K ringbuffer, we have an upper limit of 2k
|
||||
* cmds before we exceed the size of the ringbuffer, which results in
|
||||
* deadlock writing into the RB (ie. kernel doesn't finish writing into
|
||||
* the RB so it doesn't kick the GPU to start consuming from the RB)
|
||||
*/
|
||||
if (submit->pipe->dev->deferred_cmds > 128)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int
|
||||
msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
|
||||
struct fd_submit_fence *out_fence)
|
||||
{
|
||||
msm_submit_sp_flush_prep(submit);
|
||||
struct fd_device *dev = submit->pipe->dev;
|
||||
|
||||
return msm_submit_sp_flush_finish(submit, in_fence_fd, out_fence);
|
||||
/* Acquire lock before flush_prep() because it is possible to race between
|
||||
* this and pipe->flush():
|
||||
*/
|
||||
simple_mtx_lock(&dev->submit_lock);
|
||||
|
||||
/* If there are deferred submits from another fd_pipe, flush them now,
|
||||
* since we can't merge submits from different submitqueue's (ie. they
|
||||
* could have different priority, etc)
|
||||
*/
|
||||
if (!list_is_empty(&dev->deferred_submits) &&
|
||||
(last_submit(&dev->deferred_submits)->pipe != submit->pipe)) {
|
||||
struct list_head submit_list;
|
||||
|
||||
list_replace(&dev->deferred_submits, &submit_list);
|
||||
list_inithead(&dev->deferred_submits);
|
||||
dev->deferred_cmds = 0;
|
||||
|
||||
simple_mtx_unlock(&dev->submit_lock);
|
||||
flush_submit_list(&submit_list);
|
||||
simple_mtx_lock(&dev->submit_lock);
|
||||
}
|
||||
|
||||
list_addtail(&fd_submit_ref(submit)->node, &dev->deferred_submits);
|
||||
|
||||
bool has_shared = msm_submit_sp_flush_prep(submit, in_fence_fd, out_fence);
|
||||
|
||||
/* If we don't need an out-fence, we can defer the submit.
|
||||
*
|
||||
* TODO we could defer submits with in-fence as well.. if we took our own
|
||||
* reference to the fd, and merged all the in-fence-fd's when we flush the
|
||||
* deferred submits
|
||||
*/
|
||||
if ((in_fence_fd == -1) && !out_fence && !has_shared && should_defer(submit)) {
|
||||
dev->deferred_cmds += fd_ringbuffer_cmd_count(submit->primary);
|
||||
assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev));
|
||||
simple_mtx_unlock(&dev->submit_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct list_head submit_list;
|
||||
|
||||
list_replace(&dev->deferred_submits, &submit_list);
|
||||
list_inithead(&dev->deferred_submits);
|
||||
dev->deferred_cmds = 0;
|
||||
|
||||
simple_mtx_unlock(&dev->submit_lock);
|
||||
|
||||
return flush_submit_list(&submit_list);
|
||||
}
|
||||
|
||||
void
|
||||
msm_pipe_sp_flush(struct fd_pipe *pipe, uint32_t fence)
|
||||
{
|
||||
struct fd_device *dev = pipe->dev;
|
||||
struct list_head submit_list;
|
||||
|
||||
list_inithead(&submit_list);
|
||||
|
||||
simple_mtx_lock(&dev->submit_lock);
|
||||
|
||||
foreach_submit_safe (deferred_submit, &dev->deferred_submits) {
|
||||
/* We should never have submits from multiple pipes in the deferred
|
||||
* list. If we did, we couldn't compare their fence to our fence,
|
||||
* since each fd_pipe is an independent timeline.
|
||||
*/
|
||||
if (deferred_submit->pipe != pipe)
|
||||
break;
|
||||
|
||||
if (fd_fence_after(deferred_submit->fence, fence))
|
||||
break;
|
||||
|
||||
list_del(&deferred_submit->node);
|
||||
list_addtail(&deferred_submit->node, &submit_list);
|
||||
dev->deferred_cmds -= fd_ringbuffer_cmd_count(deferred_submit->primary);
|
||||
}
|
||||
|
||||
assert(dev->deferred_cmds == fd_dev_count_deferred_cmds(dev));
|
||||
|
||||
simple_mtx_unlock(&dev->submit_lock);
|
||||
|
||||
if (list_is_empty(&submit_list))
|
||||
return;
|
||||
|
||||
flush_submit_list(&submit_list);
|
||||
}
|
||||
|
||||
static void
|
||||
|
||||
Reference in New Issue
Block a user