From 6d6b22b7341e30ea5fe4d7d1b9b3d683eb697e16 Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Mon, 29 Sep 2025 10:25:04 -0700 Subject: [PATCH] intel/xe: unify behavior with i915.ko regarding ENOMEM on DRM_IOCTL_XE_EXEC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the system is under memory pressure (which can happen, for example, during CI runs), don't immediately give up on the exec ioctl (which, for Vulkan, will result in the device being declared lost). Instead, retry for a little while, just like we do for i915.ko. This is a trade-off. One of the reasons to *not* have unified behavior regarding ENOMEM between i915.ko and xe.ko is the fact that xe.ko uses vm_bind, so if the user tried to bind more memory than it is able to bind, we'll just keep getting ENOMEM as long as we retry the ioctl. We now have a retry limit, so we'll eventually return the error. On the other hand, if the problem is other applications consuming all the memory, having the retry loop may really help avoid unnecessarily marking the device as lost, since one of our retries may eventually succeed. I believe the trade-off of "we'll now eventually succeed in some cases where it's possible to succeed, at the expense of retrying for a few seconds until giving up in cases where we would never be able to succeed" is an improvement. If xe.ko ever gives us a way to differentiate between the two different reasons for ENOMEM, we'll be able to make things much better. We can also tune our timeouts if needed. 
Reviewed-by: José Roberto de Souza Signed-off-by: Paulo Zanoni Part-of: --- .../drivers/iris/xe/iris_kmd_backend.c | 12 ++++---- src/intel/common/xe/intel_gem.h | 28 +++++++++++++++++++ src/intel/vulkan/xe/anv_batch_chain.c | 7 ++--- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/iris/xe/iris_kmd_backend.c b/src/gallium/drivers/iris/xe/iris_kmd_backend.c index 912a67b18c9..4ccf9c7f7b9 100644 --- a/src/gallium/drivers/iris/xe/iris_kmd_backend.c +++ b/src/gallium/drivers/iris/xe/iris_kmd_backend.c @@ -26,6 +26,7 @@ #include "common/intel_debug_identifier.h" #include "common/intel_gem.h" +#include "common/xe/intel_gem.h" #include "dev/intel_debug.h" #include "iris/iris_bufmgr.h" #include "iris/iris_batch.h" @@ -408,13 +409,12 @@ xe_batch_submit(struct iris_batch *batch) .syncs = (uintptr_t)syncs, .num_syncs = sync_len, }; - if (likely(!batch->screen->devinfo->no_hw)) { - ret = intel_ioctl(iris_bufmgr_get_fd(bufmgr), DRM_IOCTL_XE_EXEC, &exec); - if (ret) { - ret = -errno; - goto error_exec; - } + ret = xe_gem_exec_ioctl(iris_bufmgr_get_fd(bufmgr), batch->screen->devinfo, + &exec); + if (ret) { + ret = -errno; + goto error_exec; } if (!iris_implicit_sync_export(batch, &implicit_sync)) diff --git a/src/intel/common/xe/intel_gem.h b/src/intel/common/xe/intel_gem.h index d6796f0834f..3440fd5ee0b 100644 --- a/src/intel/common/xe/intel_gem.h +++ b/src/intel/common/xe/intel_gem.h @@ -27,7 +27,10 @@ #include #include +#include "common/intel_gem.h" #include "common/intel_engine.h" +#include "drm-uapi/xe_drm.h" +#include "util/os_time.h" bool xe_gem_read_render_timestamp(int fd, uint64_t *value); bool @@ -42,3 +45,28 @@ bool xe_gem_can_render_on_fd(int fd); bool xe_gem_supports_protected_exec_queue(int fd); void intel_xe_gem_add_ext(uint64_t *ptr, uint32_t ext_name, void *data); + +static inline int +xe_gem_exec_ioctl(int fd, const struct intel_device_info *info, + struct drm_xe_exec *exec) +{ + int ret, retries; + + if 
(unlikely(info->no_hw)) + return 0; + + /* After 80 retries, we spent more than 16s sleeping. */ + for (retries = 0; retries < 80; retries++) { + ret = intel_ioctl(fd, DRM_IOCTL_XE_EXEC, exec); + + if (likely(!(ret && errno == ENOMEM))) + break; + + if (unlikely(retries == 40)) + fprintf(stderr, "intel: the execbuf ioctl keeps returning ENOMEM\n"); + + os_time_sleep(100 * retries * retries); + } + + return ret; +} diff --git a/src/intel/vulkan/xe/anv_batch_chain.c b/src/intel/vulkan/xe/anv_batch_chain.c index f052b2d77df..4c9c5d294fc 100644 --- a/src/intel/vulkan/xe/anv_batch_chain.c +++ b/src/intel/vulkan/xe/anv_batch_chain.c @@ -26,6 +26,7 @@ #include "anv_private.h" #include "anv_measure.h" #include "common/intel_bind_timeline.h" +#include "common/xe/intel_gem.h" #include "perf/intel_perf.h" #include "drm-uapi/xe_drm.h" @@ -139,12 +140,8 @@ xe_exec_ioctl_impl(struct anv_queue *queue, struct drm_xe_exec *exec, const char *func, int line) { struct anv_device *device = queue->device; - int ret; - if (unlikely(device->info->no_hw)) - return VK_SUCCESS; - - ret = intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, exec); + int ret = xe_gem_exec_ioctl(device->fd, device->info, exec); if (ret) return vk_queue_set_lost(&queue->vk, "%s(%d) failed: %m", func, line);