From 19dba854be005142f70c36895622c5d46c3b9ba2 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Tue, 9 Jan 2024 15:33:54 +0100 Subject: [PATCH] wsi/x11: Rewrite implementation to always use threads. The current implementation has many different code paths which get very messy to reason about and maintain. - FIFO mode worked well enough. - IMMEDIATE did not need a thread at all, but the present wait implementation complicated a lot of things since we had to handle concurrent special event reads. - MAILBOX (and Xwayland) adds even more jank on top of this where we have a present thread, but no acquire thread, so there are tons of forward progress issues to consider. In the new model, we have two threads: - Queue thread is only responsible for receiving presents, waiting for them if necessary, and submitting them to X. - Event thread pumps the special event queue and notifies other threads about frame completions. - Application thread does not interact with X directly, only through acquire/present queues and the present wait condvar. Two threads are required to implement IMMEDIATE and MAILBOX well. IDLE events can come back at any time and the queue thread might be waiting for a new presentation request to come through. This new model has the advantage that we will be able to implement VK_EXT_swapchain_maintenance1 in a more reasonable way, since we can just toggle the present mode per present request as all presentations go through the same system. Some cleanups were done as well: - We no longer need the busy bool. Since everything goes through a thread, we just rely on acquire/present queues. - SW/non-MITSHM path is also moved to a thread. Move acquire-specific logic to the thread as well. 
Signed-off-by: Hans-Kristian Arntzen Reviewed-By: Mike Blumenkrantz Acked-by: Adam Jackson Part-of: --- src/vulkan/wsi/wsi_common_x11.c | 1060 ++++++++++--------------------- 1 file changed, 319 insertions(+), 741 deletions(-) diff --git a/src/vulkan/wsi/wsi_common_x11.c b/src/vulkan/wsi/wsi_common_x11.c index 0ac83108252..4894985190b 100644 --- a/src/vulkan/wsi/wsi_common_x11.c +++ b/src/vulkan/wsi/wsi_common_x11.c @@ -35,14 +35,12 @@ #include #include "util/macros.h" -#include #include #include #include #include #include #include -#include #include #include "drm-uapi/drm_fourcc.h" #include "util/hash_table.h" @@ -1033,7 +1031,6 @@ struct x11_image { xcb_pixmap_t pixmap; xcb_xfixes_region_t update_region; /* long lived XID */ xcb_xfixes_region_t update_area; /* the above or None */ - atomic_bool busy; bool present_queued; struct xshmfence * shm_fence; uint32_t sync_fence; @@ -1043,6 +1040,7 @@ struct x11_image { uint8_t * shmaddr; uint64_t present_id; uint64_t signal_present_id; + VkPresentModeKHR present_mode; }; struct x11_swapchain { @@ -1065,48 +1063,29 @@ struct x11_swapchain { uint64_t send_sbc; uint64_t last_present_msc; uint32_t stamp; - atomic_int sent_image_count; + uint32_t sent_image_count; - bool has_present_queue; - bool has_acquire_queue; VkResult status; bool copy_is_suboptimal; struct wsi_queue present_queue; struct wsi_queue acquire_queue; pthread_t queue_manager; + pthread_t event_manager; - /* Lock and condition variable that lets callers monitor forward progress in the swapchain. - * This includes: - * - Present ID completion updates (present_id). - * - Pending ID pending updates (present_id_pending). - * - Any errors happening while blocking on present progress updates (present_progress_error). - * - present_submitted_count. - */ + /* Used for communicating between event_manager and queue_manager. + * Lock is also taken when reading and writing status. 
+ * When reading status in application threads, + * x11_swapchain_read_status_atomic can be used as a wrapper function. */ + pthread_mutex_t thread_state_lock; + pthread_cond_t thread_state_cond; + + /* Lock and condition variable for present wait. + * Signalled by event thread and waited on by callers to PresentWaitKHR. */ pthread_mutex_t present_progress_mutex; pthread_cond_t present_progress_cond; - - /* Lock needs to be taken when waiting for and reading presentation events. - * Only relevant in non-FIFO modes where AcquireNextImage or WaitForPresentKHR may - * have to pump the XCB connection on its own. */ - pthread_mutex_t present_poll_mutex; - - /* For VK_KHR_present_wait. */ uint64_t present_id; - uint64_t present_id_pending; - - /* When blocking on present progress, this can be set and progress_cond is signalled to unblock waiters. */ VkResult present_progress_error; - /* For handling wait_ready scenario where two different threads can pump the connection. */ - - /* Updated by presentation thread. Incremented when a present is submitted to X. - * Signals progress_cond when this happens. */ - uint64_t present_submitted_count; - /* Total number of images ever pushed to a present queue. */ - uint64_t present_queue_push_count; - /* Total number of images returned to application in AcquireNextImage. */ - uint64_t present_poll_acquire_count; - struct x11_image images[0]; }; VK_DEFINE_NONDISP_HANDLE_CASTS(x11_swapchain, base.base, VkSwapchainKHR, @@ -1128,36 +1107,19 @@ static void x11_present_complete(struct x11_swapchain *swapchain, static void x11_notify_pending_present(struct x11_swapchain *swapchain, struct x11_image *image) { - if (image->present_id || !swapchain->has_acquire_queue) { - pthread_mutex_lock(&swapchain->present_progress_mutex); - if (image->present_id > swapchain->present_id_pending) { - /* Unblock any thread waiting for a presentID out of order. 
*/ - swapchain->present_id_pending = image->present_id; - } - - /* If we don't have an acquire queue, we might need to let - * vkAcquireNextImageKHR call know that it is safe to poll for presentation events. */ - swapchain->present_submitted_count++; - - pthread_cond_broadcast(&swapchain->present_progress_cond); - pthread_mutex_unlock(&swapchain->present_progress_mutex); - } - - /* It is possible that an IDLE is observed before PRESENT_COMPLETE when - * not flipping. In this case, reading image->present_id might be a race - * in the FIFO management thread. */ - if (image->present_id) - image->signal_present_id = image->present_id; + image->signal_present_id = image->present_id; + pthread_cond_signal(&swapchain->thread_state_cond); } +/* It is assumed that thread_state_lock is taken when calling this function. */ static void x11_swapchain_notify_error(struct x11_swapchain *swapchain, VkResult result) { pthread_mutex_lock(&swapchain->present_progress_mutex); swapchain->present_id = UINT64_MAX; - swapchain->present_id_pending = UINT64_MAX; swapchain->present_progress_error = result; pthread_cond_broadcast(&swapchain->present_progress_cond); pthread_mutex_unlock(&swapchain->present_progress_mutex); + pthread_cond_broadcast(&swapchain->thread_state_cond); } /** @@ -1168,6 +1130,8 @@ static void x11_swapchain_notify_error(struct x11_swapchain *swapchain, VkResult * We make sure to 'stick' more pessimistic statuses: an out-of-date error * is permanent once seen, and every subsequent call will return this. If * this has not been seen, success will be returned. + * + * It is assumed that thread_state_lock is taken when calling this function. 
*/ static VkResult _x11_swapchain_result(struct x11_swapchain *chain, VkResult result, @@ -1256,11 +1220,9 @@ x11_handle_dri3_present_event(struct x11_swapchain *chain, for (unsigned i = 0; i < chain->base.image_count; i++) { if (chain->images[i].pixmap == idle->pixmap) { - chain->images[i].busy = false; chain->sent_image_count--; assert(chain->sent_image_count >= 0); - if (chain->has_acquire_queue) - wsi_queue_push(&chain->acquire_queue, i); + wsi_queue_push(&chain->acquire_queue, i); break; } } @@ -1277,6 +1239,7 @@ x11_handle_dri3_present_event(struct x11_swapchain *chain, if (image->present_queued && image->serial == complete->serial) { x11_present_complete(chain, &chain->images[i]); image->present_queued = false; + pthread_cond_signal(&chain->thread_state_cond); } } chain->last_present_msc = complete->msc; @@ -1329,284 +1292,12 @@ x11_handle_dri3_present_event(struct x11_swapchain *chain, return VK_SUCCESS; } -static VkResult -x11_poll_for_special_event(struct x11_swapchain *chain, uint64_t abs_timeout, xcb_generic_event_t **out_event) -{ - /* Start out with 1 ms intervals since that's what poll() supports. */ - uint64_t poll_busywait_ns = 1000 * 1000; - xcb_generic_event_t *event; - uint64_t rel_timeout; - struct pollfd pfds; - - assert(abs_timeout != UINT64_MAX); - - /* abs_timeout is assumed to be in timebase of os_time_get_absolute_timeout(). */ - - /* See comments in x11_manage_fifo_queues about problems with xcb_poll followed by poll(). - * This path is suboptimal for scenarios where we're doing: - * - IMMEDIATE / MAILBOX (no acquire queue) and - * - Timeout that is neither 0 nor UINT64_MAX (very rare). - * The only real solution is a busy-poll scheme to ensure we don't sleep for too long. - * In a forward progress scenario, the XCB FD will be written at least once per frame, - * so we expect frequent wake-ups either way. - * This is a best-effort pragmatic solution until we have a proper solution in XCB. 
- */ - - rel_timeout = abs_timeout; - *out_event = NULL; - event = NULL; - - while (1) { - event = xcb_poll_for_special_event(chain->conn, chain->special_event); - - if (event || rel_timeout == 0) - break; - - /* If a non-special event happens, the fd will still - * poll. So recalculate the timeout now just in case. - */ - uint64_t current_time = os_time_get_nano(); - if (abs_timeout > current_time) - rel_timeout = MIN2(poll_busywait_ns, abs_timeout - current_time); - else - rel_timeout = 0; - - if (rel_timeout) { - pfds.fd = xcb_get_file_descriptor(chain->conn); - pfds.events = POLLIN; - int ret = poll(&pfds, 1, MAX2(rel_timeout / 1000 / 1000, 1u)); - if (ret == -1) - return VK_ERROR_OUT_OF_DATE_KHR; - - /* Gradually increase the poll duration if it takes a very long time to receive a poll event, - * since at that point, stutter isn't really the main concern anymore. - * We generally expect a special event to be received once every refresh duration. */ - poll_busywait_ns += poll_busywait_ns / 2; - poll_busywait_ns = MIN2(10ull * 1000ull * 1000ull, poll_busywait_ns); - } - } - - *out_event = event; - return event ? VK_SUCCESS : VK_TIMEOUT; -} - -static bool -x11_acquire_next_image_poll_has_forward_progress(struct x11_swapchain *chain) -{ - /* We have forward progress in the sense that we just error out. */ - if (chain->present_progress_error != VK_SUCCESS) - return true; - - /* If we got here, there are no available images. - * Some images might be acquired, but not submitted. - * Some images might be submitted to FIFO thread, but not submitted to X yet. */ - - /* If application holds on to images without presenting, it affects forward progress. - * If application holds on to too many images, forward progress may be impossible. - * Application is allowed to call acquire with timeout in these scenarios, but not UINT64_MAX, since it may deadlock. 
*/ - assert(chain->present_poll_acquire_count >= chain->present_queue_push_count); - unsigned application_owned_images = chain->present_poll_acquire_count - chain->present_queue_push_count; - assert(application_owned_images <= chain->base.image_count); - - const unsigned minimum_images = 2; - - /* To observe an IDLE event, we must have submitted at least 2 present requests to X. - * The first present may replace another swapchain's image, but it cannot IDLE one of our own. - * Refuse forward progress until we have observed two completed present requests. - * If we are in a steady state, we only need one present to be able to idle the current image. - * In a blit style composition (windowed mode), images may be idled immediately, so this requirement is relaxed, - * but we have to assume the worst case of FLIP model where the front buffer holds on to one of the swapchain images. */ - if (chain->present_submitted_count < minimum_images) - return false; - - /* Since there are no available images, all images not owned by application have been pushed to FIFO thread. - * There must be at least 2 presents queued up. */ - unsigned present_queued_images = chain->base.image_count - application_owned_images; - if (present_queued_images < minimum_images) - return false; - - /* Present queue must have caught up. */ - return (chain->present_queue_push_count - chain->present_submitted_count) <= - (present_queued_images - minimum_images); -} - -static VkResult -x11_acquire_next_image_poll_find_index(struct x11_swapchain *chain, uint32_t *image_index) -{ - /* We don't need a lock here. AcquireNextImageKHR cannot be called concurrently, - * and busy flag is atomic. 
*/ - for (uint32_t i = 0; i < chain->base.image_count; i++) { - if (!chain->images[i].busy) { - /* We found a non-busy image */ - xshmfence_await(chain->images[i].shm_fence); - *image_index = i; - chain->images[i].busy = true; - chain->present_poll_acquire_count++; - return x11_swapchain_result(chain, VK_SUCCESS); - } - } - - return x11_swapchain_result(chain, VK_NOT_READY); -} - -/** - * Acquire a ready-to-use image directly from our swapchain. If all images are - * busy wait until one is not anymore or till timeout. - */ -static VkResult -x11_acquire_next_image_poll_x11(struct x11_swapchain *chain, - uint32_t *image_index, uint64_t timeout) -{ - struct timespec rel_timeout, abs_timespec_realtime, start_time; - xcb_generic_event_t *event; - VkResult result; - - /* If another thread is pumping the event queue, and we're polling with timeout == 0, - * try a quick poll before we try to take any locks. */ - result = x11_acquire_next_image_poll_find_index(chain, image_index); - if (result != VK_NOT_READY) - return result; - - uint64_t atimeout; - if (timeout == 0 || timeout == UINT64_MAX) - atimeout = timeout; - else - atimeout = os_time_get_absolute_timeout(timeout); - - /* Mutex abs_timeout is in REALTIME timebase. */ - timespec_from_nsec(&rel_timeout, timeout); - clock_gettime(CLOCK_REALTIME, &start_time); - timespec_add(&abs_timespec_realtime, &rel_timeout, &start_time); - - if (chain->has_present_queue) { - /* If we have a present queue (but no acquire queue), - * we might need the present queue to complete - * a request before we can guarantee forward progress in the poll loop below. - * We take the poll_mutex, but so does the present queue. */ - pthread_mutex_lock(&chain->present_progress_mutex); - - /* There must be at least one present in-flight that has been committed to X, - * otherwise we can never satisfy the acquire operation if all images are busy, - * since we would be waiting on an event that will never happen. 
*/ - struct timespec abs_timespec; - timespec_from_nsec(&abs_timespec, atimeout); - result = VK_SUCCESS; - - while (!x11_acquire_next_image_poll_has_forward_progress(chain)) { - int ret = pthread_cond_timedwait(&chain->present_progress_cond, &chain->present_progress_mutex, &abs_timespec); - - if (ret == ETIMEDOUT) { - result = x11_swapchain_result(chain, timeout == 0 ? VK_NOT_READY : VK_TIMEOUT); - break; - } - - if (ret) { - result = VK_ERROR_DEVICE_LOST; - break; - } - } - - if (result == VK_SUCCESS && chain->present_progress_error != VK_SUCCESS) - result = chain->present_progress_error; - - pthread_mutex_unlock(&chain->present_progress_mutex); - - if (result != VK_SUCCESS) - return result; - } - - int ret; - if (timeout == UINT64_MAX) - ret = pthread_mutex_lock(&chain->present_poll_mutex); - else - ret = pthread_mutex_timedlock(&chain->present_poll_mutex, &abs_timespec_realtime); - - if (ret) { - if (ret == ETIMEDOUT) - return timeout == 0 ? VK_NOT_READY : VK_TIMEOUT; - else - return VK_ERROR_DEVICE_LOST; - } - - while (1) { - result = x11_acquire_next_image_poll_find_index(chain, image_index); - if (result != VK_NOT_READY) - goto out_unlock; - - xcb_flush(chain->conn); - - if (timeout == UINT64_MAX) { - /* See comments in x11_manage_fifo_queues about problem scenarios with this call. */ - event = xcb_wait_for_special_event(chain->conn, chain->special_event); - if (!event) { - result = x11_swapchain_result(chain, VK_ERROR_SURFACE_LOST_KHR); - goto out_unlock; - } - } else { - result = x11_poll_for_special_event(chain, atimeout, &event); - if (result == VK_TIMEOUT) { - /* AcquireNextImageKHR reserves a special return value for 0 timeouts. */ - result = x11_swapchain_result(chain, timeout == 0 ? VK_NOT_READY : VK_TIMEOUT); - goto out_unlock; - } else if (result != VK_SUCCESS) { - result = x11_swapchain_result(chain, result); - goto out_unlock; - } - } - - /* Update the swapchain status here. 
We may catch non-fatal errors here, - * in which case we need to update the status and continue. - */ - result = x11_handle_dri3_present_event(chain, (void *)event); - /* Ensure that VK_SUBOPTIMAL_KHR is reported to the application */ - result = x11_swapchain_result(chain, result); - free(event); - if (result < 0) - goto out_unlock; - } - -out_unlock: - pthread_mutex_unlock(&chain->present_poll_mutex); - return result; -} - -/** - * Acquire a ready-to-use image from the acquire-queue. Only relevant in fifo - * presentation mode. - */ -static VkResult -x11_acquire_next_image_from_queue(struct x11_swapchain *chain, - uint32_t *image_index_out, uint64_t timeout) -{ - assert(chain->has_acquire_queue); - - uint32_t image_index; - VkResult result = wsi_queue_pull(&chain->acquire_queue, - &image_index, timeout); - if (result < 0 || result == VK_TIMEOUT) { - /* On error, the thread has shut down, so safe to update chain->status. - * Calling x11_swapchain_result with VK_TIMEOUT won't modify - * chain->status so that is also safe. - */ - return x11_swapchain_result(chain, result); - } else if (chain->status < 0) { - return chain->status; - } - - assert(image_index < chain->base.image_count); - xshmfence_await(chain->images[image_index].shm_fence); - - *image_index_out = image_index; - - return chain->status; -} - /** * Send image to X server via Present extension. 
*/ static VkResult x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index, - uint64_t target_msc) + uint64_t target_msc, VkPresentModeKHR present_mode) { struct x11_image *image = &chain->images[image_index]; @@ -1622,13 +1313,13 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index, if (!wsi_conn) return VK_ERROR_OUT_OF_HOST_MEMORY; - if (chain->base.present_mode == VK_PRESENT_MODE_IMMEDIATE_KHR || - (chain->base.present_mode == VK_PRESENT_MODE_MAILBOX_KHR && + if (present_mode == VK_PRESENT_MODE_IMMEDIATE_KHR || + (present_mode == VK_PRESENT_MODE_MAILBOX_KHR && wsi_conn->is_xwayland) || - chain->base.present_mode == VK_PRESENT_MODE_FIFO_RELAXED_KHR) + present_mode == VK_PRESENT_MODE_FIFO_RELAXED_KHR) options |= XCB_PRESENT_OPTION_ASYNC; - if (chain->base.present_mode == VK_PRESENT_MODE_IMMEDIATE_KHR + if (present_mode == VK_PRESENT_MODE_IMMEDIATE_KHR && chain->has_async_may_tear) options |= XCB_PRESENT_OPTION_ASYNC_MAY_TEAR; @@ -1674,6 +1365,10 @@ x11_present_to_x11_sw(struct x11_swapchain *chain, uint32_t image_index, { struct x11_image *image = &chain->images[image_index]; + /* Begin querying this before submitting the frame for improved async performance. + * In this _sw() mode we're expecting network round-trip delay, not just UNIX socket delay. */ + xcb_get_geometry_cookie_t geom_cookie = xcb_get_geometry(chain->conn, chain->window); + xcb_void_cookie_t cookie; void *myptr = image->base.cpu_map; size_t hdr_len = sizeof(xcb_put_image_request_t); @@ -1711,9 +1406,27 @@ x11_present_to_x11_sw(struct x11_swapchain *chain, uint32_t image_index, } } - chain->images[image_index].busy = false; xcb_flush(chain->conn); - return x11_swapchain_result(chain, VK_SUCCESS); + + /* We don't have queued present here. + * Immediately let application acquire again, but query geometry first so + * we can report SUBOPTIMAL on resize. 
*/ + xcb_generic_error_t *err; + + xcb_get_geometry_reply_t *geom = xcb_get_geometry_reply(chain->conn, geom_cookie, &err); + VkResult result = VK_SUCCESS; + if (geom) { + if (chain->extent.width != geom->width || + chain->extent.height != geom->height) + result = VK_SUBOPTIMAL_KHR; + } else { + result = VK_ERROR_SURFACE_LOST_KHR; + } + free(err); + free(geom); + + wsi_queue_push(&chain->acquire_queue, image_index); + return result; } static void @@ -1751,12 +1464,50 @@ x11_capture_trace(struct x11_swapchain *chain) #endif } +static VkResult x11_swapchain_read_status_atomic(struct x11_swapchain *chain) +{ + pthread_mutex_lock(&chain->thread_state_lock); + VkResult status = chain->status; + pthread_mutex_unlock(&chain->thread_state_lock); + return status; +} + +/** + * Decides if an early wait on buffer fences before buffer submission is required. That is for: + * - Mailbox mode, as otherwise the latest image in the queue might not be fully rendered at + * present time, which could lead to missing a frame. + * - Immediate mode under Xwayland, as it works practically the same as mailbox mode using the + * mailbox mechanism of Wayland. Sending a buffer with fences not yet signalled can make the + * compositor miss a frame when compositing the final image with this buffer. + * + * Note though that early waits can be disabled in general on Xwayland by setting the + * 'vk_xwayland_wait_ready' DRIConf option to false. + */ +static bool +x11_needs_wait_for_fences(const struct wsi_device *wsi_device, + struct wsi_x11_connection *wsi_conn, + VkPresentModeKHR present_mode) +{ + if (wsi_conn->is_xwayland && !wsi_device->x11.xwaylandWaitReady) { + return false; + } + + switch (present_mode) { + case VK_PRESENT_MODE_MAILBOX_KHR: + return true; + case VK_PRESENT_MODE_IMMEDIATE_KHR: + return wsi_conn->is_xwayland; + default: + return false; + } +} + /** * Send image to the X server for presentation at target_msc. 
*/ static VkResult x11_present_to_x11(struct x11_swapchain *chain, uint32_t image_index, - uint64_t target_msc) + uint64_t target_msc, VkPresentModeKHR present_mode) { x11_capture_trace(chain); @@ -1764,7 +1515,7 @@ x11_present_to_x11(struct x11_swapchain *chain, uint32_t image_index, if (chain->base.wsi->sw && !chain->has_mit_shm) result = x11_present_to_x11_sw(chain, image_index, target_msc); else - result = x11_present_to_x11_dri3(chain, image_index, target_msc); + result = x11_present_to_x11_dri3(chain, image_index, target_msc, present_mode); if (result < 0) x11_swapchain_notify_error(chain, result); @@ -1785,18 +1536,7 @@ x11_release_images(struct wsi_swapchain *wsi_chain, for (uint32_t i = 0; i < count; i++) { uint32_t index = indices[i]; assert(index < chain->base.image_count); - - if (chain->has_acquire_queue) { - wsi_queue_push(&chain->acquire_queue, index); - } else { - assert(chain->images[index].busy); - chain->images[index].busy = false; - } - } - - if (!chain->has_acquire_queue) { - assert(chain->present_poll_acquire_count >= count); - chain->present_poll_acquire_count -= count; + wsi_queue_push(&chain->acquire_queue, index); } return VK_SUCCESS; @@ -1817,43 +1557,28 @@ x11_acquire_next_image(struct wsi_swapchain *anv_chain, uint64_t timeout = info->timeout; /* If the swapchain is in an error state, don't go any further. 
*/ - if (chain->status < 0) - return chain->status; + VkResult result = x11_swapchain_read_status_atomic(chain); + if (result < 0) + return result; - if (chain->base.wsi->sw && !chain->has_mit_shm) { - for (unsigned i = 0; i < chain->base.image_count; i++) { - if (!chain->images[i].busy) { - *image_index = i; - chain->images[i].busy = true; - chain->present_poll_acquire_count++; - xcb_generic_error_t *err; + result = wsi_queue_pull(&chain->acquire_queue, + image_index, timeout); - xcb_get_geometry_cookie_t geom_cookie = xcb_get_geometry(chain->conn, chain->window); - xcb_get_geometry_reply_t *geom = xcb_get_geometry_reply(chain->conn, geom_cookie, &err); - VkResult result = VK_SUCCESS; - if (geom) { - struct wsi_device *wsi_device = (struct wsi_device *)chain->base.wsi; - if (!wsi_device->x11.ignore_suboptimal) { - if (chain->extent.width != geom->width || - chain->extent.height != geom->height) - result = VK_SUBOPTIMAL_KHR; - } - } else { - result = VK_ERROR_SURFACE_LOST_KHR; - } - free(err); - free(geom); - return result; - } - } - return VK_NOT_READY; - } + if (result == VK_TIMEOUT) + return info->timeout ? VK_TIMEOUT : VK_NOT_READY; - if (chain->has_acquire_queue) { - return x11_acquire_next_image_from_queue(chain, image_index, timeout); - } else { - return x11_acquire_next_image_poll_x11(chain, image_index, timeout); - } + pthread_mutex_lock(&chain->thread_state_lock); + result = x11_swapchain_result(chain, result); + pthread_mutex_unlock(&chain->thread_state_lock); + + if (result < 0) + return result; + + assert(*image_index < chain->base.image_count); + if (chain->images[*image_index].shm_fence) + xshmfence_await(chain->images[*image_index].shm_fence); + + return result; } #define MAX_DAMAGE_RECTS 64 @@ -1875,8 +1600,9 @@ x11_queue_present(struct wsi_swapchain *anv_chain, xcb_xfixes_region_t update_area = 0; /* If the swapchain is in an error state, don't go any further. 
*/ - if (chain->status < 0) - return chain->status; + VkResult status = x11_swapchain_read_status_atomic(chain); + if (status < 0) + return status; if (damage && damage->pRectangles && damage->rectangleCount > 0 && damage->rectangleCount <= MAX_DAMAGE_RECTS) { @@ -1895,49 +1621,11 @@ x11_queue_present(struct wsi_swapchain *anv_chain, } chain->images[image_index].update_area = update_area; chain->images[image_index].present_id = present_id; + /* With EXT_swapchain_maintenance1, the present mode can change per present. */ + chain->images[image_index].present_mode = chain->base.present_mode; - chain->images[image_index].busy = true; - if (chain->has_present_queue) { - wsi_queue_push(&chain->present_queue, image_index); - chain->present_queue_push_count++; - return chain->status; - } else { - /* No present queue means immedate mode, so we present immediately. */ - pthread_mutex_lock(&chain->present_poll_mutex); - VkResult result = x11_present_to_x11(chain, image_index, 0); - pthread_mutex_unlock(&chain->present_poll_mutex); - return result; - } -} - -/** - * Decides if an early wait on buffer fences before buffer submission is required. That is for: - * - Mailbox mode, as otherwise the latest image in the queue might not be fully rendered at - * present time, what could lead to missing a frame. - * - Immediate mode under Xwayland, as it works practically the same as mailbox mode using the - * mailbox mechanism of Wayland. Sending a buffer with fences not yet signalled can make the - * compositor miss a frame when compositing the final image with this buffer. - * - * Note though that early waits can be disabled in general on Xwayland by setting the - * 'vk_xwayland_wait_ready' DRIConf option to false. 
- */ -static bool -x11_needs_wait_for_fences(const struct wsi_device *wsi_device, - struct wsi_x11_connection *wsi_conn, - VkPresentModeKHR present_mode) -{ - if (wsi_conn->is_xwayland && !wsi_device->x11.xwaylandWaitReady) { - return false; - } - - switch (present_mode) { - case VK_PRESENT_MODE_MAILBOX_KHR: - return true; - case VK_PRESENT_MODE_IMMEDIATE_KHR: - return wsi_conn->is_xwayland; - default: - return false; - } + wsi_queue_push(&chain->present_queue, image_index); + return x11_swapchain_read_status_atomic(chain); } /** @@ -1951,38 +1639,116 @@ static unsigned x11_driver_owned_images(const struct x11_swapchain *chain) return chain->base.image_count - chain->sent_image_count; } +/* This thread is responsible for pumping PRESENT replies. + * This is done in a separate thread from the X11 presentation thread + * to be able to support non-blocking modes like IMMEDIATE and MAILBOX. + * Frame completion events can happen at any time, and we need to handle + * the events as soon as they come in to have a quality implementation. + * The presentation thread may go to sleep waiting for new presentation events to come in, + * and it cannot wait for both X events and application events at the same time. + * If we only cared about FIFO, this thread wouldn't be very useful. + * Earlier implementation of X11 WSI had a single FIFO thread that blocked on X events after presenting. + * For IMMEDIATE and MAILBOX, the application thread pumped the event queue, which caused a lot of pain + * when trying to deal with present wait. + */ +static void * +x11_manage_event_queue(void *state) +{ + struct x11_swapchain *chain = state; + u_thread_setname("WSI swapchain event"); + + /* While there is an outstanding IDLE we should wait for it. + * In FLIP modes at most one image will not be driver owned eventually. + * In BLIT modes, we expect that all images will eventually be driver owned, + * but we don't know which mode is being used. 
*/ + unsigned forward_progress_guaranteed_acquired_images = chain->base.image_count - 1; + + pthread_mutex_lock(&chain->thread_state_lock); + + while (chain->status >= 0) { + /* This thread should only go sleep waiting for X events when we know there are pending events. + * We expect COMPLETION events when there is at least one image marked as present_queued. + * We also expect IDLE events, but we only consider waiting for them when all images are busy, + * and application has fewer than N images acquired. */ + + bool assume_forward_progress = false; + + for (uint32_t i = 0; i < chain->base.image_count; i++) { + if (chain->images[i].present_queued) { + /* We must pump through a present wait and unblock FIFO thread if using FIFO mode. */ + assume_forward_progress = true; + break; + } + } + + if (!assume_forward_progress) { + /* If true, application expects acquire (IDLE) to happen in finite time. */ + assume_forward_progress = x11_driver_owned_images(chain) < + forward_progress_guaranteed_acquired_images; + } + + if (assume_forward_progress) { + /* Only yield lock when blocking on X11 event. */ + pthread_mutex_unlock(&chain->thread_state_lock); + xcb_generic_event_t *event = + xcb_wait_for_special_event(chain->conn, chain->special_event); + pthread_mutex_lock(&chain->thread_state_lock); + + /* Re-check status since we dropped the lock while waiting for X. */ + VkResult result = chain->status; + + if (result >= 0) { + if (event) { + /* Queue thread will be woken up if anything interesting happened in handler. + * Queue thread blocks on: + * - Presentation events completing + * - Presentation requests from application + * - WaitForFence workaround if applicable */ + result = x11_handle_dri3_present_event(chain, (void *) event); + } else { + result = VK_ERROR_SURFACE_LOST_KHR; + } + } + + /* Updates chain->status and wakes up threads as necessary on error. 
*/ + x11_swapchain_result(chain, result); + free(event); + } else { + /* Nothing important to do, go to sleep until queue thread wakes us up. */ + pthread_cond_wait(&chain->thread_state_cond, &chain->thread_state_lock); + } + } + + pthread_mutex_unlock(&chain->thread_state_lock); + return NULL; +} + /** - * Our queue manager. Albeit called x11_manage_fifo_queues only directly - * manages the present-queue and does this in general in fifo and mailbox presentation - * modes (there is no present-queue in immediate mode with the exception of Xwayland). + * Presentation thread. * * Runs in a separate thread, blocks and reacts to queued images on the * present-queue * - * In mailbox mode the queue management is simplified since we only need to - * pull new images from the present queue and can directly present them. - * - * In fifo mode images can only be presented one after the other. For that after - * sending the image to the X server we wait until the image either has been - * presented or released and only then pull a new image from the present-queue. + * This must be a thread since we have to block in two cases: + * - FIFO: + * We must wait for previous presentation to complete + * in some way so we can compute the target MSC. + * - WaitForFence workaround: + * In some cases, we need to wait for image to complete rendering before submitting it to X. 
*/ static void * -x11_manage_fifo_queues(void *state) +x11_manage_present_queue(void *state) { struct x11_swapchain *chain = state; struct wsi_x11_connection *wsi_conn = - wsi_x11_get_connection((struct wsi_device*)chain->base.wsi, chain->conn); + wsi_x11_get_connection((struct wsi_device*)chain->base.wsi, chain->conn); VkResult result = VK_SUCCESS; - assert(chain->has_present_queue); - u_thread_setname("WSI swapchain queue"); - while (chain->status >= 0) { - /* We can block here unconditionally because after an image was sent to - * the server (later on in this loop) we ensure at least one image is - * acquirable by the consumer or wait there on such an event. - */ + uint64_t target_msc = 0; + + while (x11_swapchain_read_status_atomic(chain) >= 0) { uint32_t image_index = 0; { MESA_TRACE_SCOPE("pull present queue"); @@ -1990,117 +1756,70 @@ x11_manage_fifo_queues(void *state) assert(result != VK_TIMEOUT); } - if (result < 0) { - goto fail; - } else if (chain->status < 0) { - /* The status can change underneath us if the swapchain is destroyed - * from another thread. - */ - return NULL; - } + /* The status can change underneath us if the swapchain is destroyed + * from another thread. */ + if (result >= 0) + result = x11_swapchain_read_status_atomic(chain); + if (result < 0) + break; + + VkPresentModeKHR present_mode = chain->images[image_index].present_mode; - /* Waiting for the GPU work to finish at this point in time is required in certain usage - * scenarios. Otherwise we wait as usual in wsi_common_queue_present. 
- */ if (x11_needs_wait_for_fences(chain->base.wsi, wsi_conn, - chain->base.present_mode)) { + present_mode)) { MESA_TRACE_SCOPE("wait fence"); result = chain->base.wsi->WaitForFences(chain->base.device, 1, - &chain->base.fences[image_index], - true, UINT64_MAX); + &chain->base.fences[image_index], + true, UINT64_MAX); if (result != VK_SUCCESS) { result = VK_ERROR_OUT_OF_DATE_KHR; - goto fail; + break; } } - uint64_t target_msc = 0; - if (chain->has_acquire_queue) - target_msc = chain->last_present_msc + 1; + pthread_mutex_lock(&chain->thread_state_lock); - /* Locking here is only relevant if we don't have an acquire queue. - * WaitForPresentKHR will pump the message queue on its own unless - * has_acquire_queue and has_present_queue are both true. */ - if (!chain->has_acquire_queue) - pthread_mutex_lock(&chain->present_poll_mutex); - result = x11_present_to_x11(chain, image_index, target_msc); - if (!chain->has_acquire_queue) - pthread_mutex_unlock(&chain->present_poll_mutex); + /* In IMMEDIATE and MAILBOX modes, don't try to present the same image again + * until we have observed COMPLETE. + * That way we avoid overriding the pending signal_present_id. */ + while (chain->status >= 0 && chain->images[image_index].present_queued) { + pthread_cond_wait(&chain->thread_state_cond, &chain->thread_state_lock); + } - if (result < 0) - goto fail; + if (chain->status < 0) { + pthread_mutex_unlock(&chain->thread_state_lock); + break; + } - if (chain->has_acquire_queue) { + result = x11_present_to_x11(chain, image_index, target_msc, present_mode); + + if (result < 0) { + pthread_mutex_unlock(&chain->thread_state_lock); + break; + } + + if (present_mode == VK_PRESENT_MODE_FIFO_KHR || + present_mode == VK_PRESENT_MODE_FIFO_RELAXED_KHR) { MESA_TRACE_SCOPE("wait present"); - /* Assume this isn't a swapchain where we force 5 images, because those - * don't end up with an acquire queue at the moment. 
- */ - unsigned min_image_count = x11_get_min_image_count(chain->base.wsi, wsi_conn->is_xwayland); - - /* With drirc overrides some games have swapchain with less than - * minimum number of images. */ - min_image_count = MIN2(min_image_count, chain->base.image_count); - - /* We always need to ensure that the app can have this number of images - * acquired concurrently in between presents: - * "VUID-vkAcquireNextImageKHR-swapchain-01802 - * If the number of currently acquired images is greater than the difference - * between the number of images in swapchain and the value of - * VkSurfaceCapabilitiesKHR::minImageCount as returned by a call to - * vkGetPhysicalDeviceSurfaceCapabilities2KHR with the surface used to - * create swapchain, timeout must not be UINT64_MAX" - */ - unsigned forward_progress_guaranteed_acquired_images = - chain->base.image_count - min_image_count + 1; - - /* Wait for our presentation to occur and ensure we have at least one - * image that can be acquired by the client afterwards. This ensures we - * can pull on the present-queue on the next loop. - */ - while (chain->images[image_index].present_queued || - /* If we have images in the present queue the outer loop won't block and a break - * here would end up at this loop again, otherwise a break here satisfies - * VUID-vkAcquireNextImageKHR-swapchain-01802 */ - x11_driver_owned_images(chain) < forward_progress_guaranteed_acquired_images) { - - /* Calls to xcb_wait_for_special_event are broken by design due to a XCB flaw. - * This call may hang indefinitely if the X window is destroyed before the swapchain. - * An X window destruction does not trigger a special event here, unfortunately. - * - * A workaround was attempted in - * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13564#note_1121977, but - * was reverted due to its high CPU usage. - * No pragmatic solution exists that solves CPU usage and stutter problems. 
- * - * A xcb_poll call followed by poll() is a race condition if other threads read from the XCB connection FD - * between the xcb poll and fd poll(), even if it's completely unrelated to this event queue. - * poll() may end up waiting indefinitely even if the XCB event has been moved from the FD - * to chain->special_event queue. - * The proper fix is a wait_for_special_event_with_timeout, but it does not exist. - * See https://gitlab.freedesktop.org/xorg/lib/libxcb/-/merge_requests/9. - * For now, keep this approach. Applications are generally well-behaved. */ - xcb_generic_event_t *event = - xcb_wait_for_special_event(chain->conn, chain->special_event); - if (!event) { - result = VK_ERROR_SURFACE_LOST_KHR; - goto fail; - } - - result = x11_handle_dri3_present_event(chain, (void *)event); - /* Ensure that VK_SUBOPTIMAL_KHR is reported to the application */ - result = x11_swapchain_result(chain, result); - free(event); - if (result < 0) - goto fail; + while (chain->status >= 0 && chain->images[image_index].present_queued) { + /* In FIFO mode, we need to make sure we observe a COMPLETE before queueing up + * another present. */ + pthread_cond_wait(&chain->thread_state_cond, &chain->thread_state_lock); } + + /* If next present is not FIFO, we still need to ensure we don't override that + * present. If FIFO, we need to ensure MSC is larger than the COMPLETED frame. 
*/ + target_msc = chain->last_present_msc + 1; } + + pthread_mutex_unlock(&chain->thread_state_lock); } -fail: + pthread_mutex_lock(&chain->thread_state_lock); x11_swapchain_result(chain, result); - if (chain->has_acquire_queue) - wsi_queue_push(&chain->acquire_queue, UINT32_MAX); + wsi_queue_push(&chain->acquire_queue, UINT32_MAX); + pthread_mutex_unlock(&chain->thread_state_lock); return NULL; } @@ -2150,7 +1869,6 @@ x11_image_init(VkDevice device_h, struct x11_swapchain *chain, if (chain->base.wsi->sw) { if (!chain->has_mit_shm) { - image->busy = false; return VK_SUCCESS; } @@ -2252,7 +1970,6 @@ out_fence: false, fence_fd); - image->busy = false; xshmfence_trigger(image->shm_fence); return VK_SUCCESS; @@ -2438,16 +2155,18 @@ x11_swapchain_destroy(struct wsi_swapchain *anv_chain, struct x11_swapchain *chain = (struct x11_swapchain *)anv_chain; xcb_void_cookie_t cookie; - if (chain->has_present_queue) { - chain->status = VK_ERROR_OUT_OF_DATE_KHR; - /* Push a UINT32_MAX to wake up the manager */ - wsi_queue_push(&chain->present_queue, UINT32_MAX); - pthread_join(chain->queue_manager, NULL); + pthread_mutex_lock(&chain->thread_state_lock); + chain->status = VK_ERROR_OUT_OF_DATE_KHR; + pthread_cond_broadcast(&chain->thread_state_cond); + pthread_mutex_unlock(&chain->thread_state_lock); - if (chain->has_acquire_queue) - wsi_queue_destroy(&chain->acquire_queue); - wsi_queue_destroy(&chain->present_queue); - } + /* Push a UINT32_MAX to wake up the manager */ + wsi_queue_push(&chain->present_queue, UINT32_MAX); + pthread_join(chain->queue_manager, NULL); + pthread_join(chain->event_manager, NULL); + + wsi_queue_destroy(&chain->acquire_queue); + wsi_queue_destroy(&chain->present_queue); for (uint32_t i = 0; i < chain->base.image_count; i++) x11_image_finish(chain, pAllocator, &chain->images[i]); @@ -2458,9 +2177,10 @@ x11_swapchain_destroy(struct wsi_swapchain *anv_chain, XCB_PRESENT_EVENT_MASK_NO_EVENT); xcb_discard_reply(chain->conn, cookie.sequence); - 
pthread_mutex_destroy(&chain->present_poll_mutex); pthread_mutex_destroy(&chain->present_progress_mutex); pthread_cond_destroy(&chain->present_progress_cond); + pthread_mutex_destroy(&chain->thread_state_lock); + pthread_cond_destroy(&chain->thread_state_cond); wsi_swapchain_finish(&chain->base); @@ -2495,10 +2215,11 @@ wsi_x11_set_adaptive_sync_property(xcb_connection_t *conn, free(reply); } -static VkResult x11_wait_for_present_queued( - struct x11_swapchain *chain, - uint64_t waitValue, uint64_t timeout) +static VkResult x11_wait_for_present(struct wsi_swapchain *wsi_chain, + uint64_t waitValue, + uint64_t timeout) { + struct x11_swapchain *chain = (struct x11_swapchain *)wsi_chain; struct timespec abs_timespec; uint64_t abs_timeout = 0; if (timeout != 0) @@ -2507,7 +2228,7 @@ static VkResult x11_wait_for_present_queued( /* Need to observe that the swapchain semaphore has been unsignalled, * as this is guaranteed when a present is complete. */ VkResult result = wsi_swapchain_wait_for_present_semaphore( - &chain->base, waitValue, timeout); + &chain->base, waitValue, timeout); if (result != VK_SUCCESS) return result; @@ -2533,138 +2254,6 @@ static VkResult x11_wait_for_present_queued( return result; } -static VkResult x11_wait_for_present_polled( - struct x11_swapchain *chain, - uint64_t waitValue, uint64_t timeout) -{ - struct timespec rel_timeout, abs_timespec_realtime, start_time; - struct timespec abs_timespec_monotonic; - uint64_t abs_timeout_monotonic = 0; - - if (timeout != 0) - abs_timeout_monotonic = os_time_get_absolute_timeout(timeout); - - /* Mutex abs_timeout is in REALTIME timebase. */ - timespec_from_nsec(&rel_timeout, timeout); - clock_gettime(CLOCK_REALTIME, &start_time); - timespec_add(&abs_timespec_realtime, &rel_timeout, &start_time); - - /* Need to observe that the swapchain semaphore has been unsignalled, - * as this is guaranteed when a present is complete. 
*/ - VkResult result = wsi_swapchain_wait_for_present_semaphore( - &chain->base, waitValue, timeout); - if (result != VK_SUCCESS) - return result; - - /* If we have satisfied the present ID right away, just return early. */ - pthread_mutex_lock(&chain->present_progress_mutex); - if (chain->present_id >= waitValue) { - result = chain->present_progress_error; - } else { - result = VK_TIMEOUT; - } - - if (result != VK_TIMEOUT) { - pthread_mutex_unlock(&chain->present_progress_mutex); - return result; - } - - timespec_from_nsec(&abs_timespec_monotonic, abs_timeout_monotonic); - - /* In a situation of wait-before-signal, we need to ensure that a presentID of at least - * waitValue has been submitted before we're allowed to lock the XCB connection. - * Even if the user does not use wait-before-signal we can still hit this scenario on Xwayland - * where we have a present queue, but no acquire queue. We need to observe that the present queue - * has actually submitted the present to XCB before we're guaranteed forward progress. */ - while (chain->present_id_pending < waitValue) { - int ret = pthread_cond_timedwait(&chain->present_progress_cond, - &chain->present_progress_mutex, - &abs_timespec_monotonic); - if (chain->present_progress_error || ret == ETIMEDOUT || ret) { - pthread_mutex_unlock(&chain->present_progress_mutex); - - if (chain->present_progress_error) - return chain->present_progress_error; - else if (ret == ETIMEDOUT) - return VK_TIMEOUT; - else - return VK_ERROR_DEVICE_LOST; - } - } - pthread_mutex_unlock(&chain->present_progress_mutex); - - /* This scheme of taking the message queue lock is not optimal, - * but it is only problematic in meaningless situations. - * - This path can only be hit by IMMEDIATE or MAILBOX mode. - * Using present wait for IMMEDIATE and MAILBOX is not particularly useful except - * for safe teardown purposes and recycling semaphores. 
- * - There is contention with multiple threads waiting for PresentWait, - * where the first thread to wait is blocking with no timeout and hogs the message queue until - * that present is processed. */ - int ret; - if (timeout == UINT64_MAX) - ret = pthread_mutex_lock(&chain->present_poll_mutex); - else - ret = pthread_mutex_timedlock(&chain->present_poll_mutex, &abs_timespec_realtime); - - if (ret) { - if (ret == ETIMEDOUT) - return VK_TIMEOUT; - else - return VK_ERROR_DEVICE_LOST; - } - - result = chain->present_progress_error; - - while (result == VK_SUCCESS && chain->present_id < waitValue) { - xcb_generic_event_t *event; - xcb_flush(chain->conn); - - if (timeout == UINT64_MAX) { - /* See comments in x11_manage_fifo_queues about problem scenarios with this call. */ - event = xcb_wait_for_special_event(chain->conn, chain->special_event); - if (!event) { - result = x11_swapchain_result(chain, VK_ERROR_SURFACE_LOST_KHR); - goto fail; - } - } else { - result = x11_poll_for_special_event(chain, abs_timeout_monotonic, &event); - if (result != VK_SUCCESS) - goto fail; - } - - result = x11_handle_dri3_present_event(chain, (void *)event); - /* Ensure that VK_SUBOPTIMAL_KHR is reported to the application */ - result = x11_swapchain_result(chain, result); - free(event); - } - -fail: - pthread_mutex_unlock(&chain->present_poll_mutex); - return result; -} - -static VkResult x11_wait_for_present(struct wsi_swapchain *wsi_chain, - uint64_t waitValue, - uint64_t timeout) -{ - struct x11_swapchain *chain = (struct x11_swapchain *)wsi_chain; - VkResult result; - - if (chain->has_present_queue && chain->has_acquire_queue) { - /* In this style we have guaranteed forward progress in the present queue thread, - * so we don't need to do anything. - * This path is hit for FIFO presentation modes. 
*/ - result = x11_wait_for_present_queued(chain, waitValue, timeout); - } else { - /* In this style we don't necessarily have forward progress, so we need to pump the message queue ourselves. - * This blocks the message queue for other threads that want to present. - * In practice, we'll only end up blocking on swapchain teardown, so this isn't a big deal. */ - result = x11_wait_for_present_polled(chain, waitValue, timeout); - } - return result; -} - static unsigned x11_get_min_image_count_for_present_mode(struct wsi_device *wsi_device, struct wsi_x11_connection *wsi_conn, @@ -2747,17 +2336,26 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, return VK_ERROR_OUT_OF_HOST_MEMORY; } - ret = pthread_mutex_init(&chain->present_poll_mutex, NULL); + ret = pthread_mutex_init(&chain->thread_state_lock, NULL); if (ret != 0) { pthread_mutex_destroy(&chain->present_progress_mutex); vk_free(pAllocator, chain); return VK_ERROR_OUT_OF_HOST_MEMORY; } + ret = pthread_cond_init(&chain->thread_state_cond, NULL); + if (ret != 0) { + pthread_mutex_destroy(&chain->present_progress_mutex); + pthread_mutex_destroy(&chain->thread_state_lock); + vk_free(pAllocator, chain); + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + bool bret = wsi_init_pthread_cond_monotonic(&chain->present_progress_cond); if (!bret) { pthread_mutex_destroy(&chain->present_progress_mutex); - pthread_mutex_destroy(&chain->present_poll_mutex); + pthread_mutex_destroy(&chain->thread_state_lock); + pthread_cond_destroy(&chain->thread_state_cond); vk_free(pAllocator, chain); return VK_ERROR_OUT_OF_HOST_MEMORY; } @@ -2815,8 +2413,6 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, chain->send_sbc = 0; chain->sent_image_count = 0; chain->last_present_msc = 0; - chain->has_acquire_queue = false; - chain->has_present_queue = false; chain->status = VK_SUCCESS; chain->has_dri3_modifiers = wsi_conn->has_dri3_modifiers; chain->has_mit_shm = wsi_conn->has_mit_shm; @@ -2901,60 +2497,33 @@ 
x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, goto fail_init_images; } - /* Initialize queues for images in our swapchain. Possible queues are: - * - Present queue: for images sent to the X server but not yet presented. - * - Acquire queue: for images already presented but not yet released by the - * X server. - * - * In general queues are not used on software drivers, otherwise which queues - * are used depends on our presentation mode: - * - Fifo: present and acquire - * - Mailbox: present only - * - Immediate: present when we wait on fences before buffer submission (Xwayland) + /* The queues have a length of base.image_count + 1 because we will + * occasionally use UINT32_MAX to signal the other thread that an error + * has occurred and we don't want an overflow. */ - if ((chain->base.present_mode == VK_PRESENT_MODE_FIFO_KHR || - chain->base.present_mode == VK_PRESENT_MODE_FIFO_RELAXED_KHR || - x11_needs_wait_for_fences(wsi_device, wsi_conn, - chain->base.present_mode)) && - !chain->base.wsi->sw) { - chain->has_present_queue = true; - - /* The queues have a length of base.image_count + 1 because we will - * occasionally use UINT32_MAX to signal the other thread that an error - * has occurred and we don't want an overflow. 
- */ - int ret; - ret = wsi_queue_init(&chain->present_queue, chain->base.image_count + 1); - if (ret) { - goto fail_init_images; - } - - if (chain->base.present_mode == VK_PRESENT_MODE_FIFO_KHR || - chain->base.present_mode == VK_PRESENT_MODE_FIFO_RELAXED_KHR) { - chain->has_acquire_queue = true; - - ret = wsi_queue_init(&chain->acquire_queue, chain->base.image_count + 1); - if (ret) { - wsi_queue_destroy(&chain->present_queue); - goto fail_init_images; - } - - for (unsigned i = 0; i < chain->base.image_count; i++) - wsi_queue_push(&chain->acquire_queue, i); - } - - ret = pthread_create(&chain->queue_manager, NULL, - x11_manage_fifo_queues, chain); - if (ret) { - wsi_queue_destroy(&chain->present_queue); - if (chain->has_acquire_queue) - wsi_queue_destroy(&chain->acquire_queue); - - goto fail_init_images; - } + ret = wsi_queue_init(&chain->present_queue, chain->base.image_count + 1); + if (ret) { + goto fail_init_images; } - assert(chain->has_present_queue || !chain->has_acquire_queue); + ret = wsi_queue_init(&chain->acquire_queue, chain->base.image_count + 1); + if (ret) { + wsi_queue_destroy(&chain->present_queue); + goto fail_init_images; + } + + for (unsigned i = 0; i < chain->base.image_count; i++) + wsi_queue_push(&chain->acquire_queue, i); + + ret = pthread_create(&chain->queue_manager, NULL, + x11_manage_present_queue, chain); + if (ret) + goto fail_init_fifo_queue; + + ret = pthread_create(&chain->event_manager, NULL, + x11_manage_event_queue, chain); + if (ret) + goto fail_init_event_queue; /* It is safe to set it here as only one swapchain can be associated with * the window, and swapchain creation does the association. 
At this point @@ -2966,6 +2535,15 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface, return VK_SUCCESS; +fail_init_event_queue: + /* Push a UINT32_MAX to wake up the manager */ + wsi_queue_push(&chain->present_queue, UINT32_MAX); + pthread_join(chain->queue_manager, NULL); + +fail_init_fifo_queue: + wsi_queue_destroy(&chain->present_queue); + wsi_queue_destroy(&chain->acquire_queue); + fail_init_images: for (uint32_t j = 0; j < image; j++) x11_image_finish(chain, pAllocator, &chain->images[j]);