intel/xe: unify behavior with i915.ko regarding ENOMEM on DRM_IOCTL_XE_EXEC

When the system is under memory pressure (which can happen, for
example, during CI runs), don't immediately give up on the exec ioctl
(which, for Vulkan, would result in the device being declared lost).
Instead, retry for a little while, just like we do for i915.ko.

This is a trade-off.

One of the reasons to *not* have unified behavior regarding ENOMEM
between i915.ko and xe.ko is the fact that xe.ko uses vm_bind, so if
the user tried to bind more memory than is available, we'll just keep
getting ENOMEM for as long as we retry the ioctl. We now have a retry
limit, so we'll eventually return the error.

On the other hand, if the problem is other applications consuming all
the memory, having the retry loop may really help avoid unnecessarily
marking the device as lost, since one of our retries may eventually
succeed.

I believe the trade-off of "we'll now eventually succeed in some cases
where it's possible to succeed, at the expense of retrying for a few
seconds before giving up in cases where we would never be able to
succeed" is an improvement.

If xe.ko ever gives us a way to differentiate between the two
different reasons for ENOMEM, we'll be able to make things much
better. We can also tune our timeouts if needed.

Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37559>
This commit is contained in:
Paulo Zanoni
2025-09-29 10:25:04 -07:00
committed by Marge Bot
parent 680daeea63
commit 6d6b22b734
3 changed files with 36 additions and 11 deletions
@@ -26,6 +26,7 @@
#include "common/intel_debug_identifier.h"
#include "common/intel_gem.h"
#include "common/xe/intel_gem.h"
#include "dev/intel_debug.h"
#include "iris/iris_bufmgr.h"
#include "iris/iris_batch.h"
@@ -408,13 +409,12 @@ xe_batch_submit(struct iris_batch *batch)
.syncs = (uintptr_t)syncs,
.num_syncs = sync_len,
};
if (likely(!batch->screen->devinfo->no_hw)) {
ret = intel_ioctl(iris_bufmgr_get_fd(bufmgr), DRM_IOCTL_XE_EXEC, &exec);
if (ret) {
ret = -errno;
goto error_exec;
}
ret = xe_gem_exec_ioctl(iris_bufmgr_get_fd(bufmgr), batch->screen->devinfo,
&exec);
if (ret) {
ret = -errno;
goto error_exec;
}
if (!iris_implicit_sync_export(batch, &implicit_sync))
+28
View File
@@ -27,7 +27,10 @@
#include <stdint.h>
#include <time.h>
#include "common/intel_gem.h"
#include "common/intel_engine.h"
#include "drm-uapi/xe_drm.h"
#include "util/os_time.h"
bool xe_gem_read_render_timestamp(int fd, uint64_t *value);
bool
@@ -42,3 +45,28 @@ bool xe_gem_can_render_on_fd(int fd);
bool xe_gem_supports_protected_exec_queue(int fd);
void intel_xe_gem_add_ext(uint64_t *ptr, uint32_t ext_name, void *data);
/**
 * Issue DRM_IOCTL_XE_EXEC, retrying with a growing back-off for as long as
 * the ioctl keeps failing with ENOMEM.
 *
 * fd:   file descriptor handed to intel_ioctl().
 * info: device info; when info->no_hw is set nothing is submitted.
 * exec: exec arguments, passed to the ioctl as-is on every attempt.
 *
 * Returns 0 when info->no_hw is set, otherwise the return value of the last
 * intel_ioctl() call. When that value is non-zero, errno is still the one
 * left by that last call: nothing else runs between the final ioctl and the
 * return, so callers can safely use -errno or "%m".
 */
static inline int
xe_gem_exec_ioctl(int fd, const struct intel_device_info *info,
                  struct drm_xe_exec *exec)
{
   /* The sleep after failed attempt n is 100 * n^2 us. Summed over every
    * attempt that is followed by a retry, that is a bit more than 16s.
    */
   const int max_attempts = 80;
   const int warn_attempt = 40;
   int ret = 0;

   if (unlikely(info->no_hw))
      return 0;

   for (int attempt = 0; attempt < max_attempts; attempt++) {
      ret = intel_ioctl(fd, DRM_IOCTL_XE_EXEC, exec);

      /* Success, or any failure other than ENOMEM, is final. */
      if (likely(!(ret && errno == ENOMEM)))
         break;

      /* Out of attempts: return right away. Sleeping here would only delay
       * the error report, and calling anything else could modify the errno
       * value the callers are about to read.
       */
      if (attempt == max_attempts - 1)
         break;

      /* Let the user know why things are stalling, but only once. */
      if (unlikely(attempt == warn_attempt))
         fprintf(stderr, "intel: the execbuf ioctl keeps returning ENOMEM\n");

      os_time_sleep(100 * attempt * attempt);
   }

   return ret;
}
+2 -5
View File
@@ -26,6 +26,7 @@
#include "anv_private.h"
#include "anv_measure.h"
#include "common/intel_bind_timeline.h"
#include "common/xe/intel_gem.h"
#include "perf/intel_perf.h"
#include "drm-uapi/xe_drm.h"
@@ -139,12 +140,8 @@ xe_exec_ioctl_impl(struct anv_queue *queue, struct drm_xe_exec *exec,
const char *func, int line)
{
struct anv_device *device = queue->device;
int ret;
if (unlikely(device->info->no_hw))
return VK_SUCCESS;
ret = intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, exec);
int ret = xe_gem_exec_ioctl(device->fd, device->info, exec);
if (ret)
return vk_queue_set_lost(&queue->vk, "%s(%d) failed: %m", func, line);