intel/xe: unify behavior with i915.ko regarding ENOMEM on DRM_IOCTL_XE_EXEC
When the system is under memory pressure (which can happen, for example, during CI runs), don't immediately give up on the exec ioctl (which, for Vulkan, will result in the device being declared lost). Instead, retry a little bit, just like we do for i915.ko. This is a trade-off. One of the reasons to *not* have unified behavior regarding ENOMEM between i915.ko and xe.ko is the fact that xe.ko uses vm_bind, so if the user tried to bind more memory than it is able to bind, we'll just keep getting ENOMEM as long as we retry the ioctl. We now have a retry limit, so we'll eventually return the error. On the other hand, if the problem is other applications consuming all the memory, having the retry loop may really help avoid unnecessarily marking the device as lost, since one of our retries may eventually succeed. I believe the trade-off of "we'll now eventually succeed in some cases where it's possible to succeed, at the expense of retrying for a few seconds until giving up in cases where we would never be able to succeed" is an improvement. If xe.ko ever gives us a way to differentiate between the two different reasons for ENOMEM, we'll be able to make things much better. We can also tune our timeouts if needed. Reviewed-by: José Roberto de Souza <jose.souza@intel.com> Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37559>
This commit is contained in:
@@ -26,6 +26,7 @@
|
||||
|
||||
#include "common/intel_debug_identifier.h"
|
||||
#include "common/intel_gem.h"
|
||||
#include "common/xe/intel_gem.h"
|
||||
#include "dev/intel_debug.h"
|
||||
#include "iris/iris_bufmgr.h"
|
||||
#include "iris/iris_batch.h"
|
||||
@@ -408,13 +409,12 @@ xe_batch_submit(struct iris_batch *batch)
|
||||
.syncs = (uintptr_t)syncs,
|
||||
.num_syncs = sync_len,
|
||||
};
|
||||
if (likely(!batch->screen->devinfo->no_hw)) {
|
||||
ret = intel_ioctl(iris_bufmgr_get_fd(bufmgr), DRM_IOCTL_XE_EXEC, &exec);
|
||||
|
||||
if (ret) {
|
||||
ret = -errno;
|
||||
goto error_exec;
|
||||
}
|
||||
ret = xe_gem_exec_ioctl(iris_bufmgr_get_fd(bufmgr), batch->screen->devinfo,
|
||||
&exec);
|
||||
if (ret) {
|
||||
ret = -errno;
|
||||
goto error_exec;
|
||||
}
|
||||
|
||||
if (!iris_implicit_sync_export(batch, &implicit_sync))
|
||||
|
||||
@@ -27,7 +27,10 @@
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "common/intel_gem.h"
|
||||
#include "common/intel_engine.h"
|
||||
#include "drm-uapi/xe_drm.h"
|
||||
#include "util/os_time.h"
|
||||
|
||||
bool xe_gem_read_render_timestamp(int fd, uint64_t *value);
|
||||
bool
|
||||
@@ -42,3 +45,28 @@ bool xe_gem_can_render_on_fd(int fd);
|
||||
bool xe_gem_supports_protected_exec_queue(int fd);
|
||||
|
||||
void intel_xe_gem_add_ext(uint64_t *ptr, uint32_t ext_name, void *data);
|
||||
|
||||
static inline int
|
||||
xe_gem_exec_ioctl(int fd, const struct intel_device_info *info,
|
||||
struct drm_xe_exec *exec)
|
||||
{
|
||||
int ret, retries;
|
||||
|
||||
if (unlikely(info->no_hw))
|
||||
return 0;
|
||||
|
||||
/* After 80 retries, we spent more than 16s sleeping. */
|
||||
for (retries = 0; retries < 80; retries++) {
|
||||
ret = intel_ioctl(fd, DRM_IOCTL_XE_EXEC, exec);
|
||||
|
||||
if (likely(!(ret && errno == ENOMEM)))
|
||||
break;
|
||||
|
||||
if (unlikely(retries == 40))
|
||||
fprintf(stderr, "intel: the execbuf ioctl keeps returning ENOMEM\n");
|
||||
|
||||
os_time_sleep(100 * retries * retries);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
#include "anv_private.h"
|
||||
#include "anv_measure.h"
|
||||
#include "common/intel_bind_timeline.h"
|
||||
#include "common/xe/intel_gem.h"
|
||||
#include "perf/intel_perf.h"
|
||||
|
||||
#include "drm-uapi/xe_drm.h"
|
||||
@@ -139,12 +140,8 @@ xe_exec_ioctl_impl(struct anv_queue *queue, struct drm_xe_exec *exec,
|
||||
const char *func, int line)
|
||||
{
|
||||
struct anv_device *device = queue->device;
|
||||
int ret;
|
||||
|
||||
if (unlikely(device->info->no_hw))
|
||||
return VK_SUCCESS;
|
||||
|
||||
ret = intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, exec);
|
||||
int ret = xe_gem_exec_ioctl(device->fd, device->info, exec);
|
||||
if (ret)
|
||||
return vk_queue_set_lost(&queue->vk, "%s(%d) failed: %m", func, line);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user