Merge tag 'for-upstream' of https://repo.or.cz/qemu/kevin into staging

Block layer patches

- stream: Fix potential crash during job completion
- aio: add the aio_add_sqe() io_uring API
- qcow2: put discards in discard queue when discard-no-unref is enabled
- qcow2, vmdk: Restrict creation with secondary file using protocol
- qemu-img rebase: Fix assertion failure due to exceeding IO_BUF_SIZE
- iotests: Run iotests with sanitizers
- iotests: Add more image formats to the thorough testing
- iotests: Improve the dry run list to speed up thorough testing
- Code cleanup

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCgAvFiEE3D3rFZqa+V09dFb+fwmycsiPL9YFAmkTqWcRHGt3b2xmQHJl
# ZGhhdC5jb20ACgkQfwmycsiPL9awPg//VqEgqYbEr3dVUvBFk8tlcewoo7KGICVk
# 4kddOwMJIdcsVpiLuNzqQARH2kHV93Hiv+mVt25o00PkJx565eCGTh/bBFas3UXL
# JMBjgHyJutGr4cijkNrnQgqWfeTgc32xdVEWh1nZM2K7LslzC9I1PfUzfxRMYqZA
# Em0KE3vwQDC7xtIyk4t451hkfcQY8fwN9bDMpD+zbzaLsYTEyOJ900En88iW7oHE
# TuJhrviin11jdQCA26QVNXRaw7iIVVo8vJP1VEgbn31iY+Qpcr/HcQRs0x2gex67
# OqIdh4onqkdGCFDxTGUoAH+jORXWUmk/JipIhl9pJP0ZDyAjsm97ThJ6SvctURsK
# UMU0dzXEc1C5spD2CWnN0PujqHYQqYaylx7MdiCJMjaCfDB3ZeIRsTGoiLMB24P+
# WBrcn2P+f03nC/sVvxRZWrpyI2kZwEh1RsO/mnLQ3apVBFeKqaFi8Ouo9oi1ZMd6
# ahUw7sZSoTxmGY1FhOSRCGEh2Wjy0ZIOx9tHT1U9vig5Kf9KeE81yO8yaq2T60mq
# 9eaUL8rcUrKRiJw9NUkcEYmIUJrh0nUe/kK2RWmbEGMYIH7ASrGqiyUP5FxpekD+
# i/uen4BeyRwe6rnPOzGolg+HMysMBr8VD/8PwJ8g88FLH1jIdTYvFUdRbrkciUlo
# okC+y4+kqiU=
# =SI8s
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 11 Nov 2025 10:23:51 PM CET
# gpg:                using RSA key DC3DEB159A9AF95D3D7456FE7F09B272C88F2FD6
# gpg:                issuer "kwolf@redhat.com"
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74  56FE 7F09 B272 C88F 2FD6

* tag 'for-upstream' of https://repo.or.cz/qemu/kevin: (28 commits)
  qemu-img rebase: don't exceed IO_BUF_SIZE in one operation
  qcow2, vmdk: Restrict creation with secondary file using protocol
  block: Allow drivers to control protocol prefix at creation
  tests/qemu-iotest: Add more image formats to the thorough testing
  tests/qemu-iotests: Improve the dry run list to speed up thorough testing
  tests/qemu-iotests/184: Fix skip message for qemu-img without throttle
  qcow2: put discards in discard queue when discard-no-unref is enabled
  qcow2: rename update_refcount_discard to queue_discard
  iotests: Run iotests with sanitizers
  qemu-img: Fix amend option parse error handling
  iotests: Test resizing file node under raw with size/offset
  block: Drop detach_subchain for bdrv_replace_node
  block: replace TABs with space
  block/io_uring: use non-vectored read/write when possible
  block/io_uring: use aio_add_sqe()
  aio-posix: add aio_add_sqe() API for user-defined io_uring requests
  aio-posix: add fdmon_ops->dispatch()
  aio-posix: unindent fdmon_io_uring_destroy()
  aio-posix: gracefully handle io_uring_queue_init() failure
  aio: add errp argument to aio_context_setup()
  ...

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
commit 9febfa94b6
Richard Henderson <richard.henderson@linaro.org>, 2025-11-12 11:47:42 +01:00
47 changed files with 1040 additions and 805 deletions

block.c

@@ -693,7 +693,7 @@ out:
}
int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
Error **errp)
bool allow_protocol_prefix, Error **errp)
{
QemuOpts *protocol_opts;
BlockDriver *drv;
@@ -702,7 +702,7 @@ int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
GLOBAL_STATE_CODE();
drv = bdrv_find_protocol(filename, true, errp);
drv = bdrv_find_protocol(filename, allow_protocol_prefix, errp);
if (drv == NULL) {
return -ENOENT;
}
@@ -5398,17 +5398,13 @@ bdrv_replace_node_noperm(BlockDriverState *from,
*
* With auto_skip=false the error is returned if from has a parent which should
* not be updated.
*
* With @detach_subchain=true @to must be in a backing chain of @from. In this
* case backing link of the cow-parent of @to is removed.
*/
static int GRAPH_WRLOCK
bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to,
bool auto_skip, bool detach_subchain, Error **errp)
bool auto_skip, Error **errp)
{
Transaction *tran = tran_new();
g_autoptr(GSList) refresh_list = NULL;
BlockDriverState *to_cow_parent = NULL;
int ret;
GLOBAL_STATE_CODE();
@@ -5417,17 +5413,6 @@ bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to,
assert(to->quiesce_counter);
assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
if (detach_subchain) {
assert(bdrv_chain_contains(from, to));
assert(from != to);
for (to_cow_parent = from;
bdrv_filter_or_cow_bs(to_cow_parent) != to;
to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent))
{
;
}
}
/*
* Do the replacement without permission update.
* Replacement may influence the permissions, we should calculate new
@@ -5439,11 +5424,6 @@ bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to,
goto out;
}
if (detach_subchain) {
/* to_cow_parent is already drained because from is drained */
bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran);
}
refresh_list = g_slist_prepend(refresh_list, to);
refresh_list = g_slist_prepend(refresh_list, from);
@@ -5462,7 +5442,7 @@ out:
int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
Error **errp)
{
return bdrv_replace_node_common(from, to, true, false, errp);
return bdrv_replace_node_common(from, to, true, errp);
}
int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
@@ -5478,7 +5458,7 @@ int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
bdrv_drained_begin(child_bs);
bdrv_graph_wrlock();
ret = bdrv_replace_node_common(bs, child_bs, true, true, errp);
ret = bdrv_replace_node_common(bs, child_bs, true, errp);
bdrv_graph_wrunlock();
bdrv_drained_end(child_bs);
@@ -5929,17 +5909,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
updated_children = g_slist_prepend(updated_children, c);
}
/*
* It seems correct to pass detach_subchain=true here, but it triggers
* one more yet not fixed bug, when due to nested aio_poll loop we switch to
* another drained section, which modify the graph (for example, removing
* the child, which we keep in updated_children list). So, it's a TODO.
*
* Note, bug triggered if pass detach_subchain=true here and run
* test-bdrv-drain. test_drop_intermediate_poll() test-case will crash.
* That's a FIXME.
*/
bdrv_replace_node_common(top, base, false, false, &local_err);
bdrv_replace_node_common(top, base, false, &local_err);
bdrv_graph_wrunlock();
if (local_err) {

block/bochs.c

@@ -300,15 +300,15 @@ static void bochs_close(BlockDriverState *bs)
}
static BlockDriver bdrv_bochs = {
.format_name = "bochs",
.instance_size = sizeof(BDRVBochsState),
.bdrv_probe = bochs_probe,
.bdrv_open = bochs_open,
.format_name = "bochs",
.instance_size = sizeof(BDRVBochsState),
.bdrv_probe = bochs_probe,
.bdrv_open = bochs_open,
.bdrv_child_perm = bdrv_default_perms,
.bdrv_refresh_limits = bochs_refresh_limits,
.bdrv_co_preadv = bochs_co_preadv,
.bdrv_close = bochs_close,
.is_format = true,
.bdrv_co_preadv = bochs_co_preadv,
.bdrv_close = bochs_close,
.is_format = true,
};
static void bdrv_bochs_init(void)

block/crypto.c

@@ -835,7 +835,7 @@ block_crypto_co_create_opts_luks(BlockDriver *drv, const char *filename,
}
/* Create protocol layer */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto fail;
}

block/file-posix.c

@@ -133,7 +133,7 @@
#define FTYPE_FILE 0
#define FTYPE_CD 1
#define MAX_BLOCKSIZE 4096
#define MAX_BLOCKSIZE 4096
/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
* leaving a few more bytes for its future use. */
@@ -755,14 +755,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
#endif /* !defined(CONFIG_LINUX_AIO) */
#ifndef CONFIG_LINUX_IO_URING
if (s->use_linux_io_uring) {
#ifdef CONFIG_LINUX_IO_URING
if (!aio_has_io_uring()) {
error_setg(errp, "aio=io_uring was specified, but is not "
"available (disabled via io_uring_disabled "
"sysctl or blocked by container runtime "
"seccomp policy?)");
ret = -EINVAL;
goto fail;
}
#else
error_setg(errp, "aio=io_uring was specified, but is not supported "
"in this build.");
"in this build");
ret = -EINVAL;
goto fail;
}
#endif /* !defined(CONFIG_LINUX_IO_URING) */
}
s->has_discard = true;
s->has_write_zeroes = true;
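(Note on the two file-posix.c hunks around this point: availability of aio=io_uring is now checked once at open time via the new aio_has_io_uring() helper, so an unusable io_uring setup fails the open with a clear error. Previously, a build with io_uring could only detect the problem at first I/O in raw_check_linux_io_uring(), removed below, which printed a warning and silently fell back to the thread pool.)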
@@ -2522,27 +2531,6 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
return true;
}
#ifdef CONFIG_LINUX_IO_URING
static inline bool raw_check_linux_io_uring(BDRVRawState *s)
{
Error *local_err = NULL;
AioContext *ctx;
if (!s->use_linux_io_uring) {
return false;
}
ctx = qemu_get_current_aio_context();
if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) {
error_reportf_err(local_err, "Unable to use linux io_uring, "
"falling back to thread pool: ");
s->use_linux_io_uring = false;
return false;
}
return true;
}
#endif
#ifdef CONFIG_LINUX_AIO
static inline bool raw_check_linux_aio(BDRVRawState *s)
{
@@ -2595,7 +2583,7 @@ raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, uint64_t bytes,
if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_IO_URING
} else if (raw_check_linux_io_uring(s)) {
} else if (s->use_linux_io_uring) {
assert(qiov->size == bytes);
ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
goto out;
@@ -2692,7 +2680,7 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
};
#ifdef CONFIG_LINUX_IO_URING
if (raw_check_linux_io_uring(s)) {
if (s->use_linux_io_uring) {
return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
}
#endif
@@ -4574,20 +4562,20 @@ static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked)
}
static BlockDriver bdrv_host_cdrom = {
.format_name = "host_cdrom",
.protocol_name = "host_cdrom",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_probe_device = cdrom_probe_device,
.bdrv_parse_filename = cdrom_parse_filename,
.bdrv_open = cdrom_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
.create_opts = &bdrv_create_opts_simple,
.mutable_opts = mutable_opts,
.format_name = "host_cdrom",
.protocol_name = "host_cdrom",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_probe_device = cdrom_probe_device,
.bdrv_parse_filename = cdrom_parse_filename,
.bdrv_open = cdrom_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
.create_opts = &bdrv_create_opts_simple,
.mutable_opts = mutable_opts,
.bdrv_co_invalidate_cache = raw_co_invalidate_cache,
.bdrv_co_preadv = raw_co_preadv,
@@ -4700,20 +4688,20 @@ static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked)
}
static BlockDriver bdrv_host_cdrom = {
.format_name = "host_cdrom",
.protocol_name = "host_cdrom",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_probe_device = cdrom_probe_device,
.bdrv_parse_filename = cdrom_parse_filename,
.bdrv_open = cdrom_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
.create_opts = &bdrv_create_opts_simple,
.mutable_opts = mutable_opts,
.format_name = "host_cdrom",
.protocol_name = "host_cdrom",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_probe_device = cdrom_probe_device,
.bdrv_parse_filename = cdrom_parse_filename,
.bdrv_open = cdrom_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
.create_opts = &bdrv_create_opts_simple,
.mutable_opts = mutable_opts,
.bdrv_co_preadv = raw_co_preadv,
.bdrv_co_pwritev = raw_co_pwritev,

block/file-win32.c

@@ -741,16 +741,16 @@ static QemuOptsList raw_create_opts = {
};
BlockDriver bdrv_file = {
.format_name = "file",
.protocol_name = "file",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_parse_filename = raw_parse_filename,
.bdrv_open = raw_open,
.bdrv_refresh_limits = raw_probe_alignment,
.bdrv_close = raw_close,
.bdrv_co_create_opts = raw_co_create_opts,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.format_name = "file",
.protocol_name = "file",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_parse_filename = raw_parse_filename,
.bdrv_open = raw_open,
.bdrv_refresh_limits = raw_probe_alignment,
.bdrv_close = raw_close,
.bdrv_co_create_opts = raw_co_create_opts,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
@@ -914,15 +914,15 @@ done:
}
static BlockDriver bdrv_host_device = {
.format_name = "host_device",
.protocol_name = "host_device",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_parse_filename = hdev_parse_filename,
.bdrv_probe_device = hdev_probe_device,
.bdrv_open = hdev_open,
.bdrv_close = raw_close,
.bdrv_refresh_limits = hdev_refresh_limits,
.format_name = "host_device",
.protocol_name = "host_device",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_parse_filename = hdev_parse_filename,
.bdrv_probe_device = hdev_probe_device,
.bdrv_open = hdev_open,
.bdrv_close = raw_close,
.bdrv_refresh_limits = hdev_refresh_limits,
.bdrv_aio_preadv = raw_aio_preadv,
.bdrv_aio_pwritev = raw_aio_pwritev,

block/io_uring.c

@@ -11,28 +11,20 @@
#include "qemu/osdep.h"
#include <liburing.h>
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/coroutine.h"
#include "qemu/defer-call.h"
#include "qapi/error.h"
#include "system/block-backend.h"
#include "trace.h"
/* Only used for assertions. */
#include "qemu/coroutine_int.h"
/* io_uring ring size */
#define MAX_ENTRIES 128
typedef struct LuringAIOCB {
typedef struct {
Coroutine *co;
struct io_uring_sqe sqeq;
ssize_t ret;
QEMUIOVector *qiov;
bool is_read;
QSIMPLEQ_ENTRY(LuringAIOCB) next;
uint64_t offset;
ssize_t ret;
int type;
int fd;
BdrvRequestFlags flags;
/*
* Buffered reads may require resubmission, see
@@ -40,36 +32,69 @@ typedef struct LuringAIOCB {
*/
int total_read;
QEMUIOVector resubmit_qiov;
} LuringAIOCB;
typedef struct LuringQueue {
unsigned int in_queue;
unsigned int in_flight;
bool blocked;
QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue;
} LuringQueue;
CqeHandler cqe_handler;
} LuringRequest;
struct LuringState {
AioContext *aio_context;
struct io_uring ring;
/* No locking required, only accessed from AioContext home thread */
LuringQueue io_q;
QEMUBH *completion_bh;
};
/**
* luring_resubmit:
*
* Resubmit a request by appending it to submit_queue. The caller must ensure
* that ioq_submit() is called later so that submit_queue requests are started.
*/
static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
{
QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
s->io_q.in_queue++;
LuringRequest *req = opaque;
QEMUIOVector *qiov = req->qiov;
uint64_t offset = req->offset;
int fd = req->fd;
BdrvRequestFlags flags = req->flags;
switch (req->type) {
case QEMU_AIO_WRITE:
{
int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
if (luring_flags != 0 || qiov->niov > 1) {
#ifdef HAVE_IO_URING_PREP_WRITEV2
io_uring_prep_writev2(sqe, fd, qiov->iov,
qiov->niov, offset, luring_flags);
#else
/*
* FUA should only be enabled with HAVE_IO_URING_PREP_WRITEV2, see
* luring_has_fua().
*/
assert(luring_flags == 0);
io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
#endif
} else {
/* The man page says non-vectored is faster than vectored */
struct iovec *iov = qiov->iov;
io_uring_prep_write(sqe, fd, iov->iov_base, iov->iov_len, offset);
}
break;
}
case QEMU_AIO_ZONE_APPEND:
io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
break;
case QEMU_AIO_READ:
{
if (req->resubmit_qiov.iov != NULL) {
qiov = &req->resubmit_qiov;
}
if (qiov->niov > 1) {
io_uring_prep_readv(sqe, fd, qiov->iov, qiov->niov,
offset + req->total_read);
} else {
/* The man page says non-vectored is faster than vectored */
struct iovec *iov = qiov->iov;
io_uring_prep_read(sqe, fd, iov->iov_base, iov->iov_len,
offset + req->total_read);
}
break;
}
case QEMU_AIO_FLUSH:
io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
break;
default:
fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
__func__, req->type);
abort();
}
}
/**
@@ -78,385 +103,115 @@ static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
* Short reads are rare but may occur. The remaining read request needs to be
* resubmitted.
*/
static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
int nread)
static void luring_resubmit_short_read(LuringRequest *req, int nread)
{
QEMUIOVector *resubmit_qiov;
size_t remaining;
trace_luring_resubmit_short_read(s, luringcb, nread);
trace_luring_resubmit_short_read(req, nread);
/* Update read position */
luringcb->total_read += nread;
remaining = luringcb->qiov->size - luringcb->total_read;
req->total_read += nread;
remaining = req->qiov->size - req->total_read;
/* Shorten qiov */
resubmit_qiov = &luringcb->resubmit_qiov;
resubmit_qiov = &req->resubmit_qiov;
if (resubmit_qiov->iov == NULL) {
qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov);
qemu_iovec_init(resubmit_qiov, req->qiov->niov);
} else {
qemu_iovec_reset(resubmit_qiov);
}
qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read,
remaining);
qemu_iovec_concat(resubmit_qiov, req->qiov, req->total_read, remaining);
/* Update sqe */
luringcb->sqeq.off += nread;
luringcb->sqeq.addr = (uintptr_t)luringcb->resubmit_qiov.iov;
luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
luring_resubmit(s, luringcb);
aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
}
/**
* luring_process_completions:
* @s: AIO state
*
* Fetches completed I/O requests, consumes cqes and invokes their callbacks
* The function is somewhat tricky because it supports nested event loops, for
* example when a request callback invokes aio_poll().
*
* Function schedules BH completion so it can be called again in a nested
* event loop. When there are no events left to complete the BH is being
* canceled.
*
*/
static void luring_process_completions(LuringState *s)
static void luring_cqe_handler(CqeHandler *cqe_handler)
{
struct io_uring_cqe *cqes;
int total_bytes;
LuringRequest *req = container_of(cqe_handler, LuringRequest, cqe_handler);
int ret = cqe_handler->cqe.res;
defer_call_begin();
trace_luring_cqe_handler(req, ret);
/*
* Request completion callbacks can run the nested event loop.
* Schedule ourselves so the nested event loop will "see" remaining
* completed requests and process them. Without this, completion
* callbacks that wait for other requests using a nested event loop
* would hang forever.
*
* This workaround is needed because io_uring uses poll_wait, which
* is woken up when new events are added to the uring, thus polling on
* the same uring fd will block unless more events are received.
*
* Other leaf block drivers (drivers that access the data themselves)
* are networking based, so they poll sockets for data and run the
* correct coroutine.
*/
qemu_bh_schedule(s->completion_bh);
while (io_uring_peek_cqe(&s->ring, &cqes) == 0) {
LuringAIOCB *luringcb;
int ret;
if (!cqes) {
break;
if (ret < 0) {
/*
* Only writev/readv/fsync requests on regular files or host block
* devices are submitted. Therefore -EAGAIN is not expected but it's
* known to happen sometimes with Linux SCSI. Submit again and hope
* the request completes successfully.
*
* For more information, see:
* https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
*
* If the code is changed to submit other types of requests in the
* future, then this workaround may need to be extended to deal with
* genuine -EAGAIN results that should not be resubmitted
* immediately.
*/
if (ret == -EINTR || ret == -EAGAIN) {
aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
return;
}
luringcb = io_uring_cqe_get_data(cqes);
ret = cqes->res;
io_uring_cqe_seen(&s->ring, cqes);
cqes = NULL;
/* Change counters one-by-one because we can be nested. */
s->io_q.in_flight--;
trace_luring_process_completion(s, luringcb, ret);
} else if (req->qiov) {
/* total_read is non-zero only for resubmitted read requests */
total_bytes = ret + luringcb->total_read;
int total_bytes = ret + req->total_read;
if (ret < 0) {
/*
* Only writev/readv/fsync requests on regular files or host block
* devices are submitted. Therefore -EAGAIN is not expected but it's
* known to happen sometimes with Linux SCSI. Submit again and hope
* the request completes successfully.
*
* For more information, see:
* https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
*
* If the code is changed to submit other types of requests in the
* future, then this workaround may need to be extended to deal with
* genuine -EAGAIN results that should not be resubmitted
* immediately.
*/
if (ret == -EINTR || ret == -EAGAIN) {
luring_resubmit(s, luringcb);
continue;
}
} else if (!luringcb->qiov) {
goto end;
} else if (total_bytes == luringcb->qiov->size) {
if (total_bytes == req->qiov->size) {
ret = 0;
/* Only read/write */
} else {
/* Short Read/Write */
if (luringcb->is_read) {
if (req->type == QEMU_AIO_READ) {
if (ret > 0) {
luring_resubmit_short_read(s, luringcb, ret);
continue;
} else {
/* Pad with zeroes */
qemu_iovec_memset(luringcb->qiov, total_bytes, 0,
luringcb->qiov->size - total_bytes);
ret = 0;
luring_resubmit_short_read(req, ret);
return;
}
/* Pad with zeroes */
qemu_iovec_memset(req->qiov, total_bytes, 0,
req->qiov->size - total_bytes);
ret = 0;
} else {
ret = -ENOSPC;
}
}
end:
luringcb->ret = ret;
qemu_iovec_destroy(&luringcb->resubmit_qiov);
/*
* If the coroutine is already entered it must be in ioq_submit()
* and will notice luringcb->ret has been filled in when it
* eventually runs later. Coroutines cannot be entered recursively
* so avoid doing that!
*/
assert(luringcb->co->ctx == s->aio_context);
if (!qemu_coroutine_entered(luringcb->co)) {
aio_co_wake(luringcb->co);
}
}
qemu_bh_cancel(s->completion_bh);
req->ret = ret;
qemu_iovec_destroy(&req->resubmit_qiov);
defer_call_end();
}
static int ioq_submit(LuringState *s)
{
int ret = 0;
LuringAIOCB *luringcb, *luringcb_next;
while (s->io_q.in_queue > 0) {
/*
* Try to fetch sqes from the ring for requests waiting in
* the overflow queue
*/
QSIMPLEQ_FOREACH_SAFE(luringcb, &s->io_q.submit_queue, next,
luringcb_next) {
struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring);
if (!sqes) {
break;
}
/* Prep sqe for submission */
*sqes = luringcb->sqeq;
QSIMPLEQ_REMOVE_HEAD(&s->io_q.submit_queue, next);
}
ret = io_uring_submit(&s->ring);
trace_luring_io_uring_submit(s, ret);
/* Prevent infinite loop if submission is refused */
if (ret <= 0) {
if (ret == -EAGAIN || ret == -EINTR) {
continue;
}
break;
}
s->io_q.in_flight += ret;
s->io_q.in_queue -= ret;
}
s->io_q.blocked = (s->io_q.in_queue > 0);
if (s->io_q.in_flight) {
/*
* We can try to complete something just right away if there are
* still requests in-flight.
*/
luring_process_completions(s);
}
return ret;
}
static void luring_process_completions_and_submit(LuringState *s)
{
luring_process_completions(s);
if (s->io_q.in_queue > 0) {
ioq_submit(s);
/*
* If the coroutine is already entered it must be in luring_co_submit() and
* will notice req->ret has been filled in when it eventually runs later.
* Coroutines cannot be entered recursively so avoid doing that!
*/
if (!qemu_coroutine_entered(req->co)) {
aio_co_wake(req->co);
}
}
static void qemu_luring_completion_bh(void *opaque)
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd,
uint64_t offset, QEMUIOVector *qiov,
int type, BdrvRequestFlags flags)
{
LuringState *s = opaque;
luring_process_completions_and_submit(s);
}
static void qemu_luring_completion_cb(void *opaque)
{
LuringState *s = opaque;
luring_process_completions_and_submit(s);
}
static bool qemu_luring_poll_cb(void *opaque)
{
LuringState *s = opaque;
return io_uring_cq_ready(&s->ring);
}
static void qemu_luring_poll_ready(void *opaque)
{
LuringState *s = opaque;
luring_process_completions_and_submit(s);
}
static void ioq_init(LuringQueue *io_q)
{
QSIMPLEQ_INIT(&io_q->submit_queue);
io_q->in_queue = 0;
io_q->in_flight = 0;
io_q->blocked = false;
}
static void luring_deferred_fn(void *opaque)
{
LuringState *s = opaque;
trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
s->io_q.in_flight);
if (!s->io_q.blocked && s->io_q.in_queue > 0) {
ioq_submit(s);
}
}
/**
* luring_do_submit:
* @fd: file descriptor for I/O
* @luringcb: AIO control block
* @s: AIO state
* @offset: offset for request
* @type: type of request
*
* Fetches sqes from ring, adds to pending queue and preps them
*
*/
static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
uint64_t offset, int type, BdrvRequestFlags flags)
{
int ret;
struct io_uring_sqe *sqes = &luringcb->sqeq;
switch (type) {
case QEMU_AIO_WRITE:
#ifdef HAVE_IO_URING_PREP_WRITEV2
{
int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
io_uring_prep_writev2(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset, luring_flags);
}
#else
assert(flags == 0);
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
#endif
break;
case QEMU_AIO_ZONE_APPEND:
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
break;
case QEMU_AIO_READ:
io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
break;
case QEMU_AIO_FLUSH:
io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC);
break;
default:
fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
__func__, type);
abort();
}
io_uring_sqe_set_data(sqes, luringcb);
QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
s->io_q.in_queue++;
trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
s->io_q.in_flight);
if (!s->io_q.blocked) {
if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
ret = ioq_submit(s);
trace_luring_do_submit_done(s, ret);
return ret;
}
defer_call(luring_deferred_fn, s);
}
return 0;
}
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
QEMUIOVector *qiov, int type,
BdrvRequestFlags flags)
{
int ret;
AioContext *ctx = qemu_get_current_aio_context();
LuringState *s = aio_get_linux_io_uring(ctx);
LuringAIOCB luringcb = {
LuringRequest req = {
.co = qemu_coroutine_self(),
.ret = -EINPROGRESS,
.qiov = qiov,
.is_read = (type == QEMU_AIO_READ),
.ret = -EINPROGRESS,
.type = type,
.fd = fd,
.offset = offset,
.flags = flags,
};
trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
type);
ret = luring_do_submit(fd, &luringcb, s, offset, type, flags);
if (ret < 0) {
return ret;
}
req.cqe_handler.cb = luring_cqe_handler;
if (luringcb.ret == -EINPROGRESS) {
trace_luring_co_submit(bs, &req, fd, offset, qiov ? qiov->size : 0, type);
aio_add_sqe(luring_prep_sqe, &req, &req.cqe_handler);
if (req.ret == -EINPROGRESS) {
qemu_coroutine_yield();
}
return luringcb.ret;
}
void luring_detach_aio_context(LuringState *s, AioContext *old_context)
{
aio_set_fd_handler(old_context, s->ring.ring_fd,
NULL, NULL, NULL, NULL, s);
qemu_bh_delete(s->completion_bh);
s->aio_context = NULL;
}
void luring_attach_aio_context(LuringState *s, AioContext *new_context)
{
s->aio_context = new_context;
s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s);
aio_set_fd_handler(s->aio_context, s->ring.ring_fd,
qemu_luring_completion_cb, NULL,
qemu_luring_poll_cb, qemu_luring_poll_ready, s);
}
LuringState *luring_init(Error **errp)
{
int rc;
LuringState *s = g_new0(LuringState, 1);
struct io_uring *ring = &s->ring;
trace_luring_init_state(s, sizeof(*s));
rc = io_uring_queue_init(MAX_ENTRIES, ring, 0);
if (rc < 0) {
error_setg_errno(errp, -rc, "failed to init linux io_uring ring");
g_free(s);
return NULL;
}
ioq_init(&s->io_q);
return s;
}
void luring_cleanup(LuringState *s)
{
io_uring_queue_exit(&s->ring);
trace_luring_cleanup_state(s);
g_free(s);
return req.ret;
}
bool luring_has_fua(void)

block/parallels.c

@@ -1117,7 +1117,7 @@ parallels_co_create_opts(BlockDriver *drv, const char *filename,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto done;
}

block/qcow.c

@@ -978,7 +978,7 @@ qcow_co_create_opts(BlockDriver *drv, const char *filename,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto fail;
}
@@ -1184,11 +1184,11 @@ static const char *const qcow_strong_runtime_opts[] = {
};
static BlockDriver bdrv_qcow = {
.format_name = "qcow",
.instance_size = sizeof(BDRVQcowState),
.bdrv_probe = qcow_probe,
.bdrv_open = qcow_open,
.bdrv_close = qcow_close,
.format_name = "qcow",
.instance_size = sizeof(BDRVQcowState),
.bdrv_probe = qcow_probe,
.bdrv_open = qcow_open,
.bdrv_close = qcow_close,
.bdrv_child_perm = bdrv_default_perms,
.bdrv_reopen_prepare = qcow_reopen_prepare,
.bdrv_co_create = qcow_co_create,

block/qcow2-cluster.c

@@ -1978,12 +1978,10 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
if (!keep_reference) {
/* Then decrease the refcount */
qcow2_free_any_cluster(bs, old_l2_entry, type);
} else if (s->discard_passthrough[type] &&
(cluster_type == QCOW2_CLUSTER_NORMAL ||
cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
} else {
/* If we keep the reference, pass on the discard still */
bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
s->cluster_size);
qcow2_discard_cluster(bs, old_l2_entry & L2E_OFFSET_MASK,
s->cluster_size, cluster_type, type);
}
}
@@ -2092,12 +2090,10 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
if (!keep_reference) {
/* Then decrease the refcount */
qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
} else if (s->discard_passthrough[QCOW2_DISCARD_REQUEST] &&
(type == QCOW2_CLUSTER_NORMAL ||
type == QCOW2_CLUSTER_ZERO_ALLOC)) {
} else {
/* If we keep the reference, pass on the discard still */
bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
s->cluster_size);
qcow2_discard_cluster(bs, old_l2_entry & L2E_OFFSET_MASK,
s->cluster_size, type, QCOW2_DISCARD_REQUEST);
}
}
}

block/qcow2-refcount.c

@@ -754,8 +754,8 @@ void qcow2_process_discards(BlockDriverState *bs, int ret)
}
}
static void update_refcount_discard(BlockDriverState *bs,
uint64_t offset, uint64_t length)
static void queue_discard(BlockDriverState *bs,
uint64_t offset, uint64_t length)
{
BDRVQcow2State *s = bs->opaque;
Qcow2DiscardRegion *d, *p, *next;
@@ -902,7 +902,7 @@ update_refcount(BlockDriverState *bs, int64_t offset, int64_t length,
}
if (s->discard_passthrough[type]) {
update_refcount_discard(bs, cluster_offset, s->cluster_size);
queue_discard(bs, cluster_offset, s->cluster_size);
}
}
}
@@ -1205,6 +1205,23 @@ void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
}
}
void qcow2_discard_cluster(BlockDriverState *bs, uint64_t offset,
uint64_t length, QCow2ClusterType ctype,
enum qcow2_discard_type dtype)
{
BDRVQcow2State *s = bs->opaque;
if (s->discard_passthrough[dtype] &&
(ctype == QCOW2_CLUSTER_NORMAL ||
ctype == QCOW2_CLUSTER_ZERO_ALLOC)) {
if (has_data_file(bs)) {
bdrv_pdiscard(s->data_file, offset, length);
} else {
queue_discard(bs, offset, length);
}
}
}
int qcow2_write_caches(BlockDriverState *bs)
{
BDRVQcow2State *s = bs->opaque;
@@ -3619,7 +3636,7 @@ qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
/* discard refblock from the cache if refblock is cached */
qcow2_cache_discard(s->refcount_block_cache, refblock);
}
update_refcount_discard(bs, discard_block_offs, s->cluster_size);
queue_discard(bs, discard_block_offs, s->cluster_size);
return 0;
}
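(Context for the qcow2 hunks above: with the discard-no-unref option enabled, e.g. -blockdev driver=qcow2,discard-no-unref=on,..., a discarded cluster keeps its refcount but the discard is still passed down. The new qcow2_discard_cluster() helper now routes that passthrough discard through queue_discard() so it is coalesced and batched by qcow2_process_discards(), just like refcount-dropping discards already were; only images with an external data file still call bdrv_pdiscard() directly, presumably because the discard queue is processed against the image file itself.)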

block/qcow2.c

@@ -3956,7 +3956,7 @@ qcow2_co_create_opts(BlockDriver *drv, const char *filename, QemuOpts *opts,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto finish;
}
@@ -3971,7 +3971,7 @@
/* Create and open an external data file (protocol layer) */
val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE);
if (val) {
ret = bdrv_co_create_file(val, opts, errp);
ret = bdrv_co_create_file(val, opts, false, errp);
if (ret < 0) {
goto finish;
}

block/qcow2.h

@@ -880,6 +880,10 @@ void GRAPH_RDLOCK qcow2_free_clusters(BlockDriverState *bs,
void GRAPH_RDLOCK
qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
enum qcow2_discard_type type);
void GRAPH_RDLOCK
qcow2_discard_cluster(BlockDriverState *bs, uint64_t offset,
uint64_t length, QCow2ClusterType ctype,
enum qcow2_discard_type dtype);
int GRAPH_RDLOCK
qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset,

block/qed.c

@@ -788,7 +788,7 @@ bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto fail;
}

block/raw-format.c

@@ -463,7 +463,7 @@ static int coroutine_fn GRAPH_UNLOCKED
raw_co_create_opts(BlockDriver *drv, const char *filename,
QemuOpts *opts, Error **errp)
{
return bdrv_co_create_file(filename, opts, errp);
return bdrv_co_create_file(filename, opts, true, errp);
}
static int raw_open(BlockDriverState *bs, QDict *options, int flags,

block/trace-events

@@ -62,15 +62,9 @@ qmp_block_stream(void *bs) "bs %p"
file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
# io_uring.c
luring_init_state(void *s, size_t size) "s %p size %zu"
luring_cleanup_state(void *s) "%p freed"
luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d"
luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d"
luring_cqe_handler(void *req, int ret) "req %p ret %d"
luring_co_submit(void *bs, void *req, int fd, uint64_t offset, size_t nbytes, int type) "bs %p req %p fd %d offset %" PRId64 " nbytes %zd type %d"
luring_resubmit_short_read(void *req, int nread) "req %p nread %d"
# qcow2.c
qcow2_add_task(void *co, void *bs, void *pool, const char *action, int cluster_type, uint64_t host_offset, uint64_t offset, uint64_t bytes, void *qiov, size_t qiov_offset) "co %p bs %p pool %p: %s: cluster_type %d file_cluster_offset %" PRIu64 " offset %" PRIu64 " bytes %" PRIu64 " qiov %p qiov_offset %zu"

block/vdi.c

@@ -938,7 +938,7 @@ vdi_co_create_opts(BlockDriver *drv, const char *filename,
qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vdi_create_opts, true);
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto done;
}

block/vhdx.c

@@ -2096,7 +2096,7 @@ vhdx_co_create_opts(BlockDriver *drv, const char *filename,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto fail;
}

block/vmdk.c

@@ -2334,7 +2334,7 @@ vmdk_create_extent(const char *filename, int64_t filesize, bool flat,
int ret;
BlockBackend *blk = NULL;
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, false, errp);
if (ret < 0) {
goto exit;
}

block/vpc.c

@@ -1118,7 +1118,7 @@ vpc_co_create_opts(BlockDriver *drv, const char *filename,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto fail;
}

include/block/aio.h

@@ -61,6 +61,27 @@ typedef struct LuringState LuringState;
/* Is polling disabled? */
bool aio_poll_disabled(AioContext *ctx);
#ifdef CONFIG_LINUX_IO_URING
/*
* Each io_uring request must have a unique CqeHandler that processes the cqe.
* The lifetime of a CqeHandler must be at least from aio_add_sqe() until
* ->cb() invocation.
*/
typedef struct CqeHandler CqeHandler;
struct CqeHandler {
/* Called by the AioContext when the request has completed */
void (*cb)(CqeHandler *handler);
/* Used internally, do not access this */
QSIMPLEQ_ENTRY(CqeHandler) next;
/* This field is filled in before ->cb() is called */
struct io_uring_cqe cqe;
};
typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
#endif /* CONFIG_LINUX_IO_URING */
/* Callbacks for file descriptor monitoring implementations */
typedef struct {
/*
@@ -106,6 +127,78 @@ typedef struct {
* Returns: true if ->wait() should be called, false otherwise.
*/
bool (*need_wait)(AioContext *ctx);
/*
* dispatch:
* @ctx: the AioContext
*
* Dispatch any work that is specific to this file descriptor monitoring
* implementation. Usually the event loop's generic file descriptor
* monitoring, BH, and timer dispatching code is sufficient, but file
* descriptor monitoring implementations offering additional functionality
* may need to implement this function for custom behavior. Called at a
* point in the event loop when it is safe to invoke user-defined
* callbacks.
*
* This function is optional and may be NULL.
*
* Returns: true if progress was made (see aio_poll()'s return value),
* false otherwise.
*/
bool (*dispatch)(AioContext *ctx);
/*
* gsource_prepare:
* @ctx: the AioContext
*
* Prepare for the glib event loop to wait for events instead of the usual
* ->wait() call. See glib's GSourceFuncs->prepare().
*/
void (*gsource_prepare)(AioContext *ctx);
/*
* gsource_check:
* @ctx: the AioContext
*
* Called by the glib event loop from glib's GSourceFuncs->check() after
* waiting for events.
*
* Returns: true when ready to be dispatched.
*/
bool (*gsource_check)(AioContext *ctx);
/*
* gsource_dispatch:
* @ctx: the AioContext
* @ready_list: list for handlers that become ready
*
* Place ready AioHandlers on ready_list. Called as part of the glib event
* loop from glib's GSourceFuncs->dispatch().
*
* Called with list_lock incremented.
*/
void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
#ifdef CONFIG_LINUX_IO_URING
/**
* add_sqe: Add an io_uring sqe for submission.
* @prep_sqe: invoked with an sqe that should be prepared for submission
* @opaque: user-defined argument to @prep_sqe()
* @cqe_handler: the unique cqe handler associated with this request
*
* The caller's @prep_sqe() function is invoked to fill in the details of
* the sqe. Do not call io_uring_sqe_set_data() on this sqe.
*
* The kernel may see the sqe as soon as @prep_sqe() returns or it may take
* until the next event loop iteration.
*
* This function is called from the current AioContext and is not
* thread-safe.
*/
void (*add_sqe)(AioContext *ctx,
void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
void *opaque, CqeHandler *cqe_handler);
#endif /* CONFIG_LINUX_IO_URING */
} FDMonOps;
/*
@@ -217,12 +310,14 @@ struct AioContext {
struct LinuxAioState *linux_aio;
#endif
#ifdef CONFIG_LINUX_IO_URING
LuringState *linux_io_uring;
/* State for file descriptor monitoring using Linux io_uring */
struct io_uring fdmon_io_uring;
AioHandlerSList submit_list;
#endif
void *io_uring_fd_tag;
/* Pending callback state for cqe handlers */
CqeHandlerSimpleQ cqe_handler_ready_list;
#endif /* CONFIG_LINUX_IO_URING */
/* TimerLists for calling timers - one per clock type. Has its own
* locking.
@@ -254,7 +349,13 @@ struct AioContext {
/* epoll(7) state used when built with CONFIG_EPOLL */
int epollfd;
/* The GSource unix fd tag for epollfd */
void *epollfd_tag;
const FDMonOps *fdmon_ops;
/* Was aio_context_new() successful? */
bool initialized;
};
/**
@@ -512,11 +613,6 @@ struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
/* Return the LinuxAioState bound to this AioContext */
struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
/* Setup the LuringState bound to this AioContext */
LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp);
/* Return the LuringState bound to this AioContext */
LuringState *aio_get_linux_io_uring(AioContext *ctx);
/**
* aio_timer_new_with_attrs:
* @ctx: the aio context
@@ -679,10 +775,13 @@ void qemu_set_current_aio_context(AioContext *ctx);
/**
* aio_context_setup:
* @ctx: the aio context
* @errp: error pointer
*
* Initialize the aio context.
*
* Returns: true on success, false otherwise
*/
void aio_context_setup(AioContext *ctx);
bool aio_context_setup(AioContext *ctx, Error **errp);
/**
* aio_context_destroy:
@@ -692,9 +791,6 @@ void aio_context_setup(AioContext *ctx);
*/
void aio_context_destroy(AioContext *ctx);
/* Used internally, do not call outside AioContext code */
void aio_context_use_g_source(AioContext *ctx);
/**
* aio_context_set_poll_params:
* @ctx: the aio context
@@ -724,4 +820,40 @@ void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
*/
void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
int64_t max, Error **errp);
#ifdef CONFIG_LINUX_IO_URING
/**
* aio_has_io_uring: Return whether io_uring is available.
*
* io_uring is either available in all AioContexts or in none, so this only
* needs to be called once from within any thread's AioContext.
*/
static inline bool aio_has_io_uring(void)
{
AioContext *ctx = qemu_get_current_aio_context();
return ctx->fdmon_ops->add_sqe;
}
/**
* aio_add_sqe: Add an io_uring sqe for submission.
* @prep_sqe: invoked with an sqe that should be prepared for submission
* @opaque: user-defined argument to @prep_sqe()
* @cqe_handler: the unique cqe handler associated with this request
*
* The caller's @prep_sqe() function is invoked to fill in the details of the
* sqe. Do not call io_uring_sqe_set_data() on this sqe.
*
* The sqe is submitted by the current AioContext. The kernel may see the sqe
* as soon as @prep_sqe() returns or it may take until the next event loop
* iteration.
*
* When the AioContext is destroyed, pending sqes are ignored and their
* CqeHandlers are not invoked.
*
* This function must be called only when aio_has_io_uring() returns true.
*/
void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
void *opaque, CqeHandler *cqe_handler);
#endif /* CONFIG_LINUX_IO_URING */
#endif
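
(To make the contract above concrete, here is a minimal sketch of a caller, modeled on the luring_co_submit() conversion earlier in this series; MyRequest and my_fsync() are hypothetical names, while CqeHandler, aio_has_io_uring() and aio_add_sqe() are the API declared in this header.)

/*
 * Hypothetical example: fsync a file descriptor through the AioContext's
 * io_uring instance and wait for completion in a coroutine.
 */
typedef struct {
    Coroutine *co;          /* coroutine waiting for the result */
    int fd;
    int ret;                /* set from cqe.res by the handler */
    CqeHandler cqe_handler; /* unique per request, must outlive ->cb() */
} MyRequest;

static void my_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
{
    MyRequest *req = opaque;

    /* Fill in the sqe; do not call io_uring_sqe_set_data() */
    io_uring_prep_fsync(sqe, req->fd, IORING_FSYNC_DATASYNC);
}

static void my_cqe_handler(CqeHandler *cqe_handler)
{
    MyRequest *req = container_of(cqe_handler, MyRequest, cqe_handler);

    req->ret = cqe_handler->cqe.res; /* cqe is filled in before ->cb() */
    aio_co_wake(req->co);
}

static int coroutine_fn my_fsync(int fd)
{
    MyRequest req = {
        .co = qemu_coroutine_self(),
        .fd = fd,
        .ret = -EINPROGRESS,
    };

    req.cqe_handler.cb = my_cqe_handler;
    assert(aio_has_io_uring()); /* callers must check availability first */

    aio_add_sqe(my_prep_sqe, &req, &req.cqe_handler);
    if (req.ret == -EINPROGRESS) {
        qemu_coroutine_yield(); /* woken from my_cqe_handler() */
    }
    return req.ret;
}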

include/block/block-global-state.h

@@ -65,7 +65,8 @@ int co_wrapper bdrv_create(BlockDriver *drv, const char *filename,
QemuOpts *opts, Error **errp);
int coroutine_fn GRAPH_UNLOCKED
bdrv_co_create_file(const char *filename, QemuOpts *opts, Error **errp);
bdrv_co_create_file(const char *filename, QemuOpts *opts,
bool allow_protocol_prefix, Error **errp);
BlockDriverState *bdrv_new(void);
int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,

include/block/nbd.h

@@ -296,7 +296,7 @@ enum {
NBD_CMD_BLOCK_STATUS = 7,
};
#define NBD_DEFAULT_PORT 10809
#define NBD_DEFAULT_PORT 10809
/* Maximum size of a single READ/WRITE data buffer */
#define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024)

include/block/raw-aio.h

@@ -74,15 +74,10 @@ static inline bool laio_has_fua(void)
#endif
/* io_uring.c - Linux io_uring implementation */
#ifdef CONFIG_LINUX_IO_URING
LuringState *luring_init(Error **errp);
void luring_cleanup(LuringState *s);
/* luring_co_submit: submit I/O requests in the thread's current AioContext. */
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
QEMUIOVector *qiov, int type,
BdrvRequestFlags flags);
void luring_detach_aio_context(LuringState *s, AioContext *old_context);
void luring_attach_aio_context(LuringState *s, AioContext *new_context);
bool luring_has_fua(void);
#else
static inline bool luring_has_fua(void)

meson.build

@@ -2745,6 +2745,8 @@ endif
if linux_io_uring.found()
config_host_data.set('HAVE_IO_URING_PREP_WRITEV2',
cc.has_header_symbol('liburing.h', 'io_uring_prep_writev2'))
config_host_data.set('HAVE_IO_URING_CQ_HAS_OVERFLOW',
cc.has_header_symbol('liburing.h', 'io_uring_cq_has_overflow'))
endif
config_host_data.set('HAVE_TCP_KEEPCNT',
cc.has_header_symbol('netinet/tcp.h', 'TCP_KEEPCNT') or

qemu-img.c

@@ -4081,7 +4081,7 @@ static int img_rebase(const img_cmd_t *ccmd, int argc, char **argv)
n += offset - QEMU_ALIGN_DOWN(offset, write_align);
offset = QEMU_ALIGN_DOWN(offset, write_align);
n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n);
n = MIN(n, size - offset);
n = MIN(n, MIN(size - offset, IO_BUF_SIZE));
assert(!bdrv_is_allocated(unfiltered_bs, offset, n, &n_alloc) &&
n_alloc == n);
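(A worked example of the new bound: with 2 MiB overlay clusters, rebasing the dirty region [1 MiB, 3 MiB) gets rounded out to the cluster boundaries [0 MiB, 4 MiB) by the alignment code above, so a single operation could previously reach 4 MiB and overrun the 2 MiB IO_BUF_SIZE buffer; the added MIN(..., IO_BUF_SIZE) caps each copy at the buffer size. This is exactly the layout exercised by the new iotest 024 case below.)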
@@ -4597,9 +4597,9 @@ static int img_amend(const img_cmd_t *ccmd, int argc, char **argv)
amend_opts = qemu_opts_append(amend_opts, bs->drv->amend_opts);
opts = qemu_opts_create(amend_opts, NULL, 0, &error_abort);
if (!qemu_opts_do_parse(opts, options, NULL, &err)) {
qemu_opts_del(opts);
/* Try to parse options using the create options */
amend_opts = qemu_opts_append(amend_opts, bs->drv->create_opts);
qemu_opts_del(opts);
opts = qemu_opts_create(amend_opts, NULL, 0, &error_abort);
if (qemu_opts_do_parse(opts, options, NULL, NULL)) {
error_append_hint(&err,

stubs/io_uring.c (deleted)

@@ -1,32 +0,0 @@
/*
* Linux io_uring support.
*
* Copyright (C) 2009 IBM, Corp.
* Copyright (C) 2009 Red Hat, Inc.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include "block/aio.h"
#include "block/raw-aio.h"
void luring_detach_aio_context(LuringState *s, AioContext *old_context)
{
abort();
}
void luring_attach_aio_context(LuringState *s, AioContext *new_context)
{
abort();
}
LuringState *luring_init(Error **errp)
{
abort();
}
void luring_cleanup(LuringState *s)
{
abort();
}

stubs/meson.build

@@ -32,9 +32,6 @@ if have_block or have_ga
stub_ss.add(files('cpus-virtual-clock.c'))
stub_ss.add(files('icount.c'))
stub_ss.add(files('graph-lock.c'))
if linux_io_uring.found()
stub_ss.add(files('io_uring.c'))
endif
if libaio.found()
stub_ss.add(files('linux-aio.c'))
endif

tests/qemu-iotests/024

@@ -315,6 +315,52 @@ echo
$QEMU_IMG map "$OVERLAY" | _filter_qemu_img_map
# Check that the region to copy to the overlay during a rebase
# operation does not exceed the I/O buffer size.
#
# backing_new <-- backing_old <-- overlay
#
# Backing (new): -- -- -- -- <-- Empty image, size 4MB
# Backing (old):|--|ff|ff|--| <-- 4 clusters, 1MB each
# Overlay: |-- --|-- --| <-- 2 clusters, 2MB each
#
# The data at [1MB, 3MB) must be copied from the old backing image to
# the overlay. However the rebase code will extend that region to the
# overlay's (sub)cluster boundaries to avoid CoW (see commit 12df580b).
# This test checks that IO_BUF_SIZE (2 MB) is taken into account.
echo
echo "=== Test that the region to copy does not exceed 2MB (IO_BUF_SIZE) ==="
echo
echo "Creating backing chain"
echo
TEST_IMG=$BASE_NEW _make_test_img 4M
TEST_IMG=$BASE_OLD CLUSTER_SIZE=1M _make_test_img -b "$BASE_NEW" -F $IMGFMT
TEST_IMG=$OVERLAY CLUSTER_SIZE=2M _make_test_img -b "$BASE_OLD" -F $IMGFMT
echo
echo "Writing data to region [1MB, 3MB)"
echo
$QEMU_IO "$BASE_OLD" -c "write -P 0xff 1M 2M" | _filter_qemu_io
echo
echo "Rebasing"
echo
$QEMU_IMG rebase -b "$BASE_NEW" -F $IMGFMT "$OVERLAY"
echo "Verifying the data"
echo
$QEMU_IO "$OVERLAY" -c "read -P 0x00 0 1M" | _filter_qemu_io
$QEMU_IO "$OVERLAY" -c "read -P 0xff 1M 2M" | _filter_qemu_io
$QEMU_IO "$OVERLAY" -c "read -P 0x00 3M 1M" | _filter_qemu_io
$QEMU_IMG map "$OVERLAY" | _filter_qemu_img_map
echo
# success, all done

tests/qemu-iotests/024.out

@@ -243,4 +243,30 @@ Offset Length File
0 0x20000 TEST_DIR/subdir/t.IMGFMT
0x40000 0x20000 TEST_DIR/subdir/t.IMGFMT
=== Test that the region to copy does not exceed 2MB (IO_BUF_SIZE) ===
Creating backing chain
Formatting 'TEST_DIR/subdir/t.IMGFMT.base_new', fmt=IMGFMT size=4194304
Formatting 'TEST_DIR/subdir/t.IMGFMT.base_old', fmt=IMGFMT size=4194304 backing_file=TEST_DIR/subdir/t.IMGFMT.base_new backing_fmt=IMGFMT
Formatting 'TEST_DIR/subdir/t.IMGFMT', fmt=IMGFMT size=4194304 backing_file=TEST_DIR/subdir/t.IMGFMT.base_old backing_fmt=IMGFMT
Writing data to region [1MB, 3MB)
wrote 2097152/2097152 bytes at offset 1048576
2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
Rebasing
Verifying the data
read 1048576/1048576 bytes at offset 0
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 2097152/2097152 bytes at offset 1048576
2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 1048576/1048576 bytes at offset 3145728
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
Offset Length File
0 0x400000 TEST_DIR/subdir/t.IMGFMT
*** done

tests/qemu-iotests/184

@@ -51,7 +51,7 @@ run_qemu()
}
test_throttle=$($QEMU_IMG --help|grep throttle)
[ "$test_throttle" = "" ] && _supported_fmt throttle
[ "$test_throttle" = "" ] && _notrun "qemu-img does not support throttle"
echo
echo "== checking interface =="

tests/qemu-iotests/257

@@ -310,14 +310,18 @@ def test_bitmap_sync(bsync_mode, msync_mode='bitmap', failure=None):
'state': 1,
'new_state': 2
}, {
'event': 'read_aio',
'event': 'flush_to_disk',
'state': 2,
'new_state': 3
}, {
'event': "read_aio",
'state': 3,
'new_state': 4
}],
'inject-error': [{
'event': 'read_aio',
'errno': 5,
'state': 3,
'state': 4,
'immediately': False,
'once': True
}]

tests/qemu-iotests/257.out

@@ -272,7 +272,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -1017,7 +1017,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -1762,7 +1762,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -2507,7 +2507,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -3252,7 +3252,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -3997,7 +3997,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -4742,7 +4742,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---


@@ -17,6 +17,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import re
import sys
import argparse
import shutil
@@ -82,7 +83,7 @@ def make_argparser() -> argparse.ArgumentParser:
g_env.add_argument('-i', dest='aiomode', default='threads',
help='sets AIOMODE environment variable')
p.set_defaults(imgfmt='raw', imgproto='file')
p.set_defaults(imgproto='file')
format_list = ['raw', 'bochs', 'cloop', 'parallels', 'qcow', 'qcow2',
'qed', 'vdi', 'vpc', 'vhdx', 'vmdk', 'luks', 'dmg', 'vvfat']
@@ -137,15 +138,50 @@ def make_argparser() -> argparse.ArgumentParser:
return p
def dry_run_list(test_dir, imgfmt, testlist):
    for t in testlist:
        if not imgfmt:
            print(os.path.basename(t))
            continue

        # If a format has been given, we look for the "supported_fmt"
        # and the "unsupported_fmt" lines in the test and try to find out
        # whether the format is supported or not. This is only a heuristic
        # (it can fail if, e.g., the "unsupported_fmts" and "supported_fmts"
        # statements are on the same line), but it should be good enough
        # to get a proper list for "make check-block"
        with open(os.path.join(test_dir, t), 'r', encoding='utf-8') as fh:
            supported = True
            check_next_line = False
            sd = "[ \t'\"]"     # Start delimiter
            ed = "([ \t'\"]|$)" # End delimiter
            for line in fh:
                if 'unsupported_fmt' in line:
                    if re.search(sd + imgfmt + ed, line):
                        supported = False
                        break
                elif 'supported_fmt' in line or check_next_line:
                    if re.search(sd + 'generic' + ed, line):
                        continue  # Might be followed by "unsupported" line
                    supported = re.search(sd + imgfmt + ed, line)
                    check_next_line = ']' not in line and \
                        ('supported_fmts=[' in line or check_next_line)
                    if supported or not check_next_line:
                        break
        if supported:
            print(os.path.basename(t))
if __name__ == '__main__':
warnings.simplefilter("default")
os.environ["PYTHONWARNINGS"] = "default"
args = make_argparser().parse_args()
image_format = args.imgfmt or 'raw'
env = TestEnv(source_dir=args.source_dir,
build_dir=args.build_dir,
imgfmt=args.imgfmt, imgproto=args.imgproto,
imgfmt=image_format, imgproto=args.imgproto,
aiomode=args.aiomode, cachemode=args.cachemode,
imgopts=args.imgopts, misalign=args.misalign,
debug=args.debug, valgrind=args.valgrind,
@@ -189,7 +225,7 @@ if __name__ == '__main__':
if args.dry_run:
with env:
print('\n'.join([os.path.basename(t) for t in tests]))
dry_run_list(env.source_iotests, args.imgfmt, tests)
else:
with TestRunner(env, tap=args.tap,
color=args.color) as tr:


@@ -2,14 +2,6 @@ if not have_tools or host_os == 'windows'
subdir_done()
endif
foreach cflag: qemu_ldflags
if cflag.startswith('-fsanitize') and \
not cflag.contains('safe-stack') and not cflag.contains('cfi-icall')
message('Sanitizers are enabled ==> Disabled the qemu-iotests.')
subdir_done()
endif
endforeach
bash = find_program('bash', required: false, version: '>= 4.0')
if not bash.found()
message('bash >= v4.0 not available ==> Disabled the qemu-iotests.')
@@ -21,7 +13,10 @@ qemu_iotests_env = {'PYTHON': python.full_path()}
qemu_iotests_formats = {
'qcow2': 'quick',
'raw': 'slow',
'parallels': 'thorough',
'qed': 'thorough',
'vdi': 'thorough',
'vhdx': 'thorough',
'vmdk': 'thorough',
'vpc': 'thorough'
}


@@ -263,10 +263,21 @@ class TestRunner(contextlib.AbstractContextManager['TestRunner']):
Path(env[d]).mkdir(parents=True, exist_ok=True)
test_dir = env['TEST_DIR']
f_asan = Path(test_dir, f_test.name + '.out.asan')
f_bad = Path(test_dir, f_test.name + '.out.bad')
f_notrun = Path(test_dir, f_test.name + '.notrun')
f_casenotrun = Path(test_dir, f_test.name + '.casenotrun')
env['ASAN_OPTIONS'] = f'detect_leaks=0:log_path={f_asan}'
def unlink_asan():
    with os.scandir(test_dir) as it:
        for entry in it:
            if entry.name.startswith(f_asan.name):
                os.unlink(entry)

unlink_asan()

for p in (f_notrun, f_casenotrun):
    silent_unlink(p)
@@ -312,6 +323,7 @@ class TestRunner(contextlib.AbstractContextManager['TestRunner']):
description=f'output mismatch (see {f_bad})',
diff=diff, casenotrun=casenotrun)
else:
unlink_asan()
f_bad.unlink()
return TestResult(status='pass', elapsed=elapsed,
casenotrun=casenotrun)


@@ -8,16 +8,27 @@
# SPDX-License-Identifier: GPL-2.0-or-later
import os
from typing import Dict, Optional
import iotests
from iotests import imgfmt, qemu_img_create, QMPTestCase
image_size = 1 * 1024 * 1024
image = os.path.join(iotests.test_dir, 'test.img')
class TestResizeBelowRaw(QMPTestCase):
class BaseResizeBelowRaw(QMPTestCase):
raw_size: Optional[int] = None
raw_offset: Optional[int] = None
def setUp(self) -> None:
qemu_img_create('-f', imgfmt, image, str(image_size))
extra_options: Dict[str, str] = {}
if self.raw_size is not None:
extra_options['size'] = str(self.raw_size)
if self.raw_offset is not None:
extra_options['offset'] = str(self.raw_offset)
self.vm = iotests.VM()
self.vm.add_blockdev(self.vm.qmp_to_opts({
'driver': imgfmt,
@@ -26,7 +37,8 @@ class TestResizeBelowRaw(QMPTestCase):
'driver': 'file',
'filename': image,
'node-name': 'file0',
}
},
**extra_options
}))
self.vm.launch()
@@ -34,14 +46,16 @@
self.vm.shutdown()
os.remove(image)
def assert_size(self, size: int) -> None:
def assert_size(self, size: int, file_size: Optional[int] = None) -> None:
nodes = self.vm.qmp('query-named-block-nodes', flat=True)['return']
self.assertEqual(len(nodes), 2)
for node in nodes:
if node['drv'] == 'file':
if node['drv'] == 'file' and file_size is not None:
self.assertEqual(node['image']['virtual-size'], file_size)
continue
self.assertEqual(node['image']['virtual-size'], size)
class TestResizeBelowUnlimitedRaw(BaseResizeBelowRaw):
def test_resize_below_raw(self) -> None:
self.assert_size(image_size)
self.vm.qmp('block_resize', node_name='file0', size=2*image_size)
@@ -49,5 +63,36 @@ class TestResizeBelowRaw(QMPTestCase):
self.vm.qmp('block_resize', node_name='node0', size=3*image_size)
self.assert_size(3*image_size)
# offset = 0 behaves the same as absent offset
class TestResizeBelowRawWithZeroOffset(TestResizeBelowUnlimitedRaw):
    raw_offset = 0

class TestResizeBelowRawWithSize(BaseResizeBelowRaw):
    raw_size = image_size // 2

    def test_resize_below_raw_with_size(self) -> None:
        self.assert_size(image_size // 2, image_size)

        # This QMP command fails because node0 unshares RESIZE
        self.vm.qmp('block_resize', node_name='file0', size=2*image_size)
        self.assert_size(image_size // 2, image_size)

        # This QMP command fails because node0 is a fixed-size disk
        self.vm.qmp('block_resize', node_name='node0', size=3*image_size)
        self.assert_size(image_size // 2, image_size)

class TestResizeBelowRawWithOffset(BaseResizeBelowRaw):
    raw_offset = image_size // 4

    def test_resize_below_raw_with_offset(self) -> None:
        self.assert_size(image_size * 3 // 4, image_size)

        # This QMP command fails because node0 unshares RESIZE
        self.vm.qmp('block_resize', node_name='file0', size=2*image_size)
        self.assert_size(image_size * 3 // 4, image_size)

        self.vm.qmp('block_resize', node_name='node0', size=3*image_size)
        self.assert_size(3 * image_size, image_size * 13 // 4)

if __name__ == '__main__':
    iotests.main(supported_fmts=['raw'], supported_protocols=['file'])


@@ -1,5 +1,5 @@
.
....
----------------------------------------------------------------------
Ran 1 tests
Ran 4 tests
OK


@@ -527,7 +527,12 @@ static void test_source_bh_delete_from_cb(void)
g_assert_cmpint(data1.n, ==, data1.max);
g_assert(data1.bh == NULL);
assert(g_main_context_iteration(NULL, false));
/*
* There may be up to one more iteration due to the aio_notify
* EventNotifier.
*/
g_main_context_iteration(NULL, false);
assert(!g_main_context_iteration(NULL, false));
}


@@ -15,6 +15,7 @@
#include "qemu/osdep.h"
#include "block/aio.h"
#include "qapi/error.h"
#include "util/aio-posix.h"
typedef struct {
AioContext *ctx;
@@ -71,17 +72,17 @@ static void test(void)
.ctx = aio_context_new(&error_abort),
};
if (td.ctx->fdmon_ops != &fdmon_poll_ops) {
/* This test is tied to fdmon-poll.c */
g_test_skip("fdmon_poll_ops not in use");
return;
}
qemu_set_current_aio_context(td.ctx);
/* Enable polling */
aio_context_set_poll_params(td.ctx, 1000000, 2, 2, &error_abort);
/*
* The GSource is unused but this has the side-effect of changing the fdmon
* that AioContext uses.
*/
aio_get_g_source(td.ctx);
/* Make the event notifier active (set) right away */
event_notifier_init(&td.poll_notifier, 1);
aio_set_event_notifier(td.ctx, &td.poll_notifier,


@@ -16,6 +16,7 @@
#include "qemu/osdep.h"
#include "block/block.h"
#include "block/thread-pool.h"
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qemu/lockcnt.h"
#include "qemu/rcu.h"
@@ -70,15 +71,6 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)
static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
{
/* If the GSource is in the process of being destroyed then
* g_source_remove_poll() causes an assertion failure. Skip
* removal in that case, because glib cleans up its state during
* destruction anyway.
*/
if (!g_source_is_destroyed(&ctx->source)) {
g_source_remove_poll(&ctx->source, &node->pfd);
}
node->pfd.revents = 0;
node->poll_ready = false;
@@ -153,7 +145,6 @@ void aio_set_fd_handler(AioContext *ctx,
} else {
new_node->pfd = node->pfd;
}
g_source_add_poll(&ctx->source, &new_node->pfd);
new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
@@ -267,37 +258,13 @@ bool aio_prepare(AioContext *ctx)
poll_set_started(ctx, &ready_list, false);
/* TODO what to do with this list? */
ctx->fdmon_ops->gsource_prepare(ctx);
return false;
}
bool aio_pending(AioContext *ctx)
{
AioHandler *node;
bool result = false;
/*
* We have to walk very carefully in case aio_set_fd_handler is
* called while we're walking.
*/
qemu_lockcnt_inc(&ctx->list_lock);
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
int revents;
/* TODO should this check poll ready? */
revents = node->pfd.revents & node->pfd.events;
if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
result = true;
break;
}
if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
result = true;
break;
}
}
qemu_lockcnt_dec(&ctx->list_lock);
return result;
return ctx->fdmon_ops->gsource_check(ctx);
}
static void aio_free_deleted_handlers(AioContext *ctx)
@@ -390,10 +357,6 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
return progress;
}
/*
* If we have a list of ready handlers then this is more efficient than
* scanning all handlers with aio_dispatch_handlers().
*/
static bool aio_dispatch_ready_handlers(AioContext *ctx,
AioHandlerList *ready_list,
int64_t block_ns)
@@ -417,24 +380,23 @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
return progress;
}
/* Slower than aio_dispatch_ready_handlers() but only used via glib */
static bool aio_dispatch_handlers(AioContext *ctx)
{
AioHandler *node, *tmp;
bool progress = false;
QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
progress = aio_dispatch_handler(ctx, node) || progress;
}
return progress;
}
void aio_dispatch(AioContext *ctx)
{
AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
qemu_lockcnt_inc(&ctx->list_lock);
aio_bh_poll(ctx);
aio_dispatch_handlers(ctx);
ctx->fdmon_ops->gsource_dispatch(ctx, &ready_list);
if (ctx->fdmon_ops->dispatch) {
ctx->fdmon_ops->dispatch(ctx);
}
/* block_ns is 0 because polling is disabled in the glib event loop */
aio_dispatch_ready_handlers(ctx, &ready_list, 0);
aio_free_deleted_handlers(ctx);
qemu_lockcnt_dec(&ctx->list_lock);
@@ -559,7 +521,14 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
max_ns = qemu_soonest_timeout(*timeout, max_ns);
assert(!(max_ns && progress));
} while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
if (ctx->fdmon_ops->need_wait(ctx)) {
if (fdmon_supports_polling(ctx)) {
*timeout = 0; /* stay in polling mode */
}
break;
}
} while (elapsed_time < max_ns);
if (remove_idle_poll_handlers(ctx, ready_list,
start_time + elapsed_time)) {
@@ -722,7 +691,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
* up IO threads when some work becomes pending. It is essential to
* avoid hangs or unnecessary latency.
*/
if (poll_set_started(ctx, &ready_list, false)) {
if (timeout && poll_set_started(ctx, &ready_list, false)) {
timeout = 0;
progress = true;
}
@@ -743,6 +712,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
}
if (ctx->fdmon_ops->dispatch) {
progress |= ctx->fdmon_ops->dispatch(ctx);
}
progress |= aio_bh_poll(ctx);
progress |= aio_dispatch_ready_handlers(ctx, &ready_list, block_ns);
@@ -755,35 +728,50 @@ bool aio_poll(AioContext *ctx, bool blocking)
return progress;
}
void aio_context_setup(AioContext *ctx)
bool aio_context_setup(AioContext *ctx, Error **errp)
{
ctx->fdmon_ops = &fdmon_poll_ops;
ctx->epollfd = -1;
ctx->epollfd_tag = NULL;
/* Use the fastest fd monitoring implementation if available */
if (fdmon_io_uring_setup(ctx)) {
return;
#ifdef CONFIG_LINUX_IO_URING
{
static bool need_io_uring;
Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */
/* io_uring takes precedence because it provides aio_add_sqe() support */
if (fdmon_io_uring_setup(ctx, &local_err)) {
/*
* If one AioContext gets io_uring, then all AioContexts need io_uring
* so that aio_add_sqe() support is available across all threads.
*/
need_io_uring = true;
return true;
}
if (need_io_uring) {
error_propagate(errp, local_err);
return false;
}
/* Silently fall back on systems where io_uring is unavailable */
error_free(local_err);
}
#endif /* CONFIG_LINUX_IO_URING */
fdmon_epoll_setup(ctx);
return true;
}
void aio_context_destroy(AioContext *ctx)
{
#ifdef CONFIG_LINUX_IO_URING
fdmon_io_uring_destroy(ctx);
fdmon_epoll_disable(ctx);
aio_free_deleted_handlers(ctx);
}
#endif
qemu_lockcnt_lock(&ctx->list_lock);
fdmon_epoll_disable(ctx);
qemu_lockcnt_unlock(&ctx->list_lock);
void aio_context_use_g_source(AioContext *ctx)
{
/*
* Disable io_uring when the glib main loop is used because it doesn't
* support mixed glib/aio_poll() usage. It relies on aio_poll() being
* called regularly so that changes to the monitored file descriptors are
* submitted, otherwise a list of pending fd handlers builds up.
*/
fdmon_io_uring_destroy(ctx);
aio_free_deleted_handlers(ctx);
}
@@ -818,3 +806,12 @@ void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch)
aio_notify(ctx);
}
#ifdef CONFIG_LINUX_IO_URING
void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
void *opaque, CqeHandler *cqe_handler)
{
AioContext *ctx = qemu_get_current_aio_context();
ctx->fdmon_ops->add_sqe(ctx, prep_sqe, opaque, cqe_handler);
}
#endif /* CONFIG_LINUX_IO_URING */
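
For orientation, here is a rough sketch of how a caller might use the new entry point, pieced together from the signatures above and the CqeHandler usage in fdmon-io_uring.c below. This is not code from the series: MyNopOp and the my_* names are invented, and it assumes the io_uring fdmon is active (CONFIG_LINUX_IO_URING).

/* Hedged sketch, not part of the patch. Assumes CqeHandler exposes a cb
 * callback and an embedded cqe, as used by fdmon-io_uring.c. */
typedef struct {
    CqeHandler cqe_handler; /* must stay alive until the completion runs */
    bool done;
} MyNopOp;

static void my_prep_nop(struct io_uring_sqe *sqe, void *opaque)
{
    /* Invoked synchronously by aio_add_sqe() to fill in the sqe */
    io_uring_prep_nop(sqe);
}

static void my_nop_cb(CqeHandler *cqe_handler)
{
    /* Invoked later from the fdmon-io_uring dispatch path */
    MyNopOp *op = container_of(cqe_handler, MyNopOp, cqe_handler);
    op->done = true; /* cqe_handler->cqe.res would hold the result */
}

static void my_submit_nop(MyNopOp *op)
{
    op->done = false;
    op->cqe_handler.cb = my_nop_cb;
    aio_add_sqe(my_prep_nop, op, &op->cqe_handler);
}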


@@ -18,6 +18,7 @@
#define AIO_POSIX_H
#include "block/aio.h"
#include "qapi/error.h"
struct AioHandler {
GPollFD pfd;
@@ -35,6 +36,7 @@ struct AioHandler {
#ifdef CONFIG_LINUX_IO_URING
QSLIST_ENTRY(AioHandler) node_submitted;
unsigned flags; /* see fdmon-io_uring.c */
CqeHandler internal_cqe_handler; /* used for POLL_ADD/POLL_REMOVE */
#endif
int64_t poll_idle_timeout; /* when to stop userspace polling */
bool poll_ready; /* has polling detected an event? */
@@ -47,9 +49,14 @@ void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
extern const FDMonOps fdmon_poll_ops;
/* Switch back to poll(2). list_lock must be held. */
void fdmon_poll_downgrade(AioContext *ctx);
#ifdef CONFIG_EPOLL_CREATE1
bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
void fdmon_epoll_setup(AioContext *ctx);
/* list_lock must be held */
void fdmon_epoll_disable(AioContext *ctx);
#else
static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
@@ -67,17 +74,8 @@ static inline void fdmon_epoll_disable(AioContext *ctx)
#endif /* !CONFIG_EPOLL_CREATE1 */
#ifdef CONFIG_LINUX_IO_URING
bool fdmon_io_uring_setup(AioContext *ctx);
bool fdmon_io_uring_setup(AioContext *ctx, Error **errp);
void fdmon_io_uring_destroy(AioContext *ctx);
#else
static inline bool fdmon_io_uring_setup(AioContext *ctx)
{
return false;
}
static inline void fdmon_io_uring_destroy(AioContext *ctx)
{
}
#endif /* !CONFIG_LINUX_IO_URING */
#endif /* AIO_POSIX_H */


@@ -419,18 +419,15 @@ bool aio_poll(AioContext *ctx, bool blocking)
return progress;
}
void aio_context_setup(AioContext *ctx)
bool aio_context_setup(AioContext *ctx, Error **errp)
{
return true;
}
void aio_context_destroy(AioContext *ctx)
{
}
void aio_context_use_g_source(AioContext *ctx)
{
}
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
int64_t grow, int64_t shrink, Error **errp)
{


@@ -366,12 +366,16 @@ aio_ctx_dispatch(GSource *source,
}
static void
aio_ctx_finalize(GSource *source)
{
AioContext *ctx = (AioContext *) source;
QEMUBH *bh;
unsigned flags;
if (!ctx->initialized) {
return;
}
thread_pool_free_aio(ctx->thread_pool);
#ifdef CONFIG_LINUX_AIO
@@ -382,14 +386,6 @@ aio_ctx_finalize(GSource *source)
}
#endif
#ifdef CONFIG_LINUX_IO_URING
if (ctx->linux_io_uring) {
luring_detach_aio_context(ctx->linux_io_uring, ctx);
luring_cleanup(ctx->linux_io_uring);
ctx->linux_io_uring = NULL;
}
#endif
assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
qemu_bh_delete(ctx->co_schedule_bh);
@@ -418,10 +414,11 @@ aio_ctx_finalize(GSource *source)
aio_set_event_notifier(ctx, &ctx->notifier, NULL, NULL, NULL);
event_notifier_cleanup(&ctx->notifier);
qemu_rec_mutex_destroy(&ctx->lock);
qemu_lockcnt_destroy(&ctx->list_lock);
timerlistgroup_deinit(&ctx->tlg);
unregister_aiocontext(ctx);
aio_context_destroy(ctx);
/* aio_context_destroy() still needs the lock */
qemu_lockcnt_destroy(&ctx->list_lock);
}
static GSourceFuncs aio_source_funcs = {
@@ -433,7 +430,6 @@ static GSourceFuncs aio_source_funcs = {
GSource *aio_get_g_source(AioContext *ctx)
{
aio_context_use_g_source(ctx);
g_source_ref(&ctx->source);
return &ctx->source;
}
@@ -465,29 +461,6 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx)
}
#endif
#ifdef CONFIG_LINUX_IO_URING
LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp)
{
if (ctx->linux_io_uring) {
return ctx->linux_io_uring;
}
ctx->linux_io_uring = luring_init(errp);
if (!ctx->linux_io_uring) {
return NULL;
}
luring_attach_aio_context(ctx->linux_io_uring, ctx);
return ctx->linux_io_uring;
}
LuringState *aio_get_linux_io_uring(AioContext *ctx)
{
assert(ctx->linux_io_uring);
return ctx->linux_io_uring;
}
#endif
void aio_notify(AioContext *ctx)
{
/*
@@ -577,19 +550,42 @@ static void co_schedule_bh_cb(void *opaque)
AioContext *aio_context_new(Error **errp)
{
ERRP_GUARD();
int ret;
AioContext *ctx;
/*
* ctx is freed by g_source_unref() (e.g. aio_context_unref()). ctx's
* resources are freed as follows:
*
* 1. By aio_ctx_finalize() after aio_context_new() has returned and set
* ->initialized = true.
*
* 2. By manual cleanup code in this function's error paths before goto
* fail.
*
* Be careful to free resources in both cases!
*/
ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
QSLIST_INIT(&ctx->bh_list);
QSIMPLEQ_INIT(&ctx->bh_slice_list);
aio_context_setup(ctx);
ret = event_notifier_init(&ctx->notifier, false);
if (ret < 0) {
error_setg_errno(errp, -ret, "Failed to initialize event notifier");
goto fail;
}
/*
* Resources cannot easily be freed manually after aio_context_setup(). If
* you add any new resources to AioContext, it's probably best to acquire
* them before aio_context_setup().
*/
if (!aio_context_setup(ctx, errp)) {
event_notifier_cleanup(&ctx->notifier);
goto fail;
}
g_source_set_can_recurse(&ctx->source, true);
qemu_lockcnt_init(&ctx->list_lock);
@@ -604,10 +600,6 @@ AioContext *aio_context_new(Error **errp)
ctx->linux_aio = NULL;
#endif
#ifdef CONFIG_LINUX_IO_URING
ctx->linux_io_uring = NULL;
#endif
ctx->thread_pool = NULL;
qemu_rec_mutex_init(&ctx->lock);
timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
@@ -623,9 +615,11 @@ AioContext *aio_context_new(Error **errp)
register_aiocontext(ctx);
ctx->initialized = true;
return ctx;
fail:
g_source_destroy(&ctx->source);
g_source_unref(&ctx->source);
return NULL;
}


@@ -19,8 +19,12 @@ void fdmon_epoll_disable(AioContext *ctx)
ctx->epollfd = -1;
}
/* Switch back */
ctx->fdmon_ops = &fdmon_poll_ops;
if (ctx->epollfd_tag) {
g_source_remove_unix_fd(&ctx->source, ctx->epollfd_tag);
ctx->epollfd_tag = NULL;
}
fdmon_poll_downgrade(ctx);
}
static inline int epoll_events_from_pfd(int pfd_events)
@@ -93,10 +97,29 @@ out:
return ret;
}
static void fdmon_epoll_gsource_prepare(AioContext *ctx)
{
/* Do nothing */
}
static bool fdmon_epoll_gsource_check(AioContext *ctx)
{
return g_source_query_unix_fd(&ctx->source, ctx->epollfd_tag) & G_IO_IN;
}
static void fdmon_epoll_gsource_dispatch(AioContext *ctx,
AioHandlerList *ready_list)
{
fdmon_epoll_wait(ctx, ready_list, 0);
}
static const FDMonOps fdmon_epoll_ops = {
.update = fdmon_epoll_update,
.wait = fdmon_epoll_wait,
.need_wait = aio_poll_disabled,
.gsource_prepare = fdmon_epoll_gsource_prepare,
.gsource_check = fdmon_epoll_gsource_check,
.gsource_dispatch = fdmon_epoll_gsource_dispatch,
};
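
The poll, epoll and io_uring backends each implement this trio of glib hooks. As a rough map of the interface, this sketch shows where each callback is invoked from, based on the call sites visible in this series; the real FDMonOps declaration in include/block/aio.h is not part of this diff and may differ in detail:

/* Sketch only: inferred shape of the new hooks, invented type name. */
typedef struct {
    /* called from aio_prepare(): submit pending work before glib polls */
    void (*gsource_prepare)(AioContext *ctx);

    /* called from aio_pending(): did monitored fds become ready? */
    bool (*gsource_check)(AioContext *ctx);

    /* called from aio_dispatch(): collect ready fds into ready_list */
    void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);

    /* called from aio_poll()/aio_dispatch(); optional (may be NULL),
     * used by fdmon-io_uring to run queued CqeHandler completions */
    bool (*dispatch)(AioContext *ctx);
} FDMonGlibHooks;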
static bool fdmon_epoll_try_enable(AioContext *ctx)
@@ -118,6 +141,8 @@ static bool fdmon_epoll_try_enable(AioContext *ctx)
}
ctx->fdmon_ops = &fdmon_epoll_ops;
ctx->epollfd_tag = g_source_add_unix_fd(&ctx->source, ctx->epollfd,
G_IO_IN);
return true;
}
@@ -139,12 +164,11 @@ bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
}
ok = fdmon_epoll_try_enable(ctx);
qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
if (!ok) {
fdmon_epoll_disable(ctx);
}
qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
return ok;
}


@@ -45,16 +45,20 @@
#include "qemu/osdep.h"
#include <poll.h>
#include "qapi/error.h"
#include "qemu/defer-call.h"
#include "qemu/rcu_queue.h"
#include "aio-posix.h"
#include "trace.h"
enum {
FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */
/* AioHandler::flags */
FDMON_IO_URING_PENDING = (1 << 0),
FDMON_IO_URING_ADD = (1 << 1),
FDMON_IO_URING_REMOVE = (1 << 2),
FDMON_IO_URING_DELETE_AIO_HANDLER = (1 << 3),
};
static inline int poll_events_from_pfd(int pfd_events)
@@ -74,8 +78,8 @@ static inline int pfd_events_from_poll(int poll_events)
}
/*
* Returns an sqe for submitting a request. Only be called within
* fdmon_io_uring_wait().
* Returns an sqe for submitting a request. Only called from the AioContext
* thread.
*/
static struct io_uring_sqe *get_sqe(AioContext *ctx)
{
@@ -166,41 +170,50 @@ static void fdmon_io_uring_update(AioContext *ctx,
}
}
static void fdmon_io_uring_add_sqe(AioContext *ctx,
void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
void *opaque, CqeHandler *cqe_handler)
{
struct io_uring_sqe *sqe = get_sqe(ctx);
prep_sqe(sqe, opaque);
io_uring_sqe_set_data(sqe, cqe_handler);
trace_fdmon_io_uring_add_sqe(ctx, opaque, sqe->opcode, sqe->fd, sqe->off,
cqe_handler);
}
static void fdmon_special_cqe_handler(CqeHandler *cqe_handler)
{
/*
* This is an empty function that is never called. It is used as a sentinel
* function pointer to distinguish AioHandler cqes from ordinary cqe handlers.
*/
}
static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
{
struct io_uring_sqe *sqe = get_sqe(ctx);
int events = poll_events_from_pfd(node->pfd.events);
io_uring_prep_poll_add(sqe, node->pfd.fd, events);
io_uring_sqe_set_data(sqe, node);
node->internal_cqe_handler.cb = fdmon_special_cqe_handler;
io_uring_sqe_set_data(sqe, &node->internal_cqe_handler);
}
static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
{
struct io_uring_sqe *sqe = get_sqe(ctx);
CqeHandler *cqe_handler = &node->internal_cqe_handler;
#ifdef LIBURING_HAVE_DATA64
io_uring_prep_poll_remove(sqe, (uintptr_t)node);
io_uring_prep_poll_remove(sqe, (uintptr_t)cqe_handler);
#else
io_uring_prep_poll_remove(sqe, node);
io_uring_prep_poll_remove(sqe, cqe_handler);
#endif
io_uring_sqe_set_data(sqe, NULL);
}
/* Add a timeout that self-cancels when another cqe becomes ready */
static void add_timeout_sqe(AioContext *ctx, int64_t ns)
{
struct io_uring_sqe *sqe;
struct __kernel_timespec ts = {
.tv_sec = ns / NANOSECONDS_PER_SECOND,
.tv_nsec = ns % NANOSECONDS_PER_SECOND,
};
sqe = get_sqe(ctx);
io_uring_prep_timeout(sqe, &ts, 1, 0);
io_uring_sqe_set_data(sqe, NULL);
}
/* Add sqes from ctx->submit_list for submission */
static void fill_sq_ring(AioContext *ctx)
{
@@ -218,22 +231,26 @@ static void fill_sq_ring(AioContext *ctx)
if (flags & FDMON_IO_URING_REMOVE) {
add_poll_remove_sqe(ctx, node);
}
if (flags & FDMON_IO_URING_DELETE_AIO_HANDLER) {
/*
* process_cqe() sets this flag after ADD and REMOVE have been
* cleared. They cannot be set again, so they must be clear.
*/
assert(!(flags & FDMON_IO_URING_ADD));
assert(!(flags & FDMON_IO_URING_REMOVE));
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
}
}
}
/* Returns true if a handler became ready */
static bool process_cqe(AioContext *ctx,
AioHandlerList *ready_list,
struct io_uring_cqe *cqe)
static bool process_cqe_aio_handler(AioContext *ctx,
AioHandlerList *ready_list,
AioHandler *node,
struct io_uring_cqe *cqe)
{
AioHandler *node = io_uring_cqe_get_data(cqe);
unsigned flags;
/* poll_timeout and poll_remove have a zero user_data field */
if (!node) {
return false;
}
/*
* Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
* with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
@@ -241,7 +258,12 @@ static bool process_cqe(AioContext *ctx,
*/
flags = qatomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
if (flags & FDMON_IO_URING_REMOVE) {
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
if (flags & FDMON_IO_URING_PENDING) {
/* Still on ctx->submit_list, defer deletion until fill_sq_ring() */
qatomic_or(&node->flags, FDMON_IO_URING_DELETE_AIO_HANDLER);
} else {
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
}
return false;
}
@@ -252,6 +274,35 @@
return true;
}
/* Returns true if a handler became ready */
static bool process_cqe(AioContext *ctx,
AioHandlerList *ready_list,
struct io_uring_cqe *cqe)
{
CqeHandler *cqe_handler = io_uring_cqe_get_data(cqe);
/* poll_timeout and poll_remove have a zero user_data field */
if (!cqe_handler) {
return false;
}
/*
* Special handling for AioHandler cqes. They need ready_list and have a
* return value.
*/
if (cqe_handler->cb == fdmon_special_cqe_handler) {
AioHandler *node = container_of(cqe_handler, AioHandler,
internal_cqe_handler);
return process_cqe_aio_handler(ctx, ready_list, node, cqe);
}
cqe_handler->cqe = *cqe;
/* Handlers are invoked later by fdmon_io_uring_dispatch() */
QSIMPLEQ_INSERT_TAIL(&ctx->cqe_handler_ready_list, cqe_handler, next);
return false;
}
static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
{
struct io_uring *ring = &ctx->fdmon_io_uring;
@@ -260,6 +311,13 @@ static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
unsigned num_ready = 0;
unsigned head;
#ifdef HAVE_IO_URING_CQ_HAS_OVERFLOW
/* If the CQ overflowed then fetch CQEs with a syscall */
if (io_uring_cq_has_overflow(ring)) {
io_uring_get_events(ring);
}
#endif
io_uring_for_each_cqe(ring, head, cqe) {
if (process_cqe(ctx, ready_list, cqe)) {
num_ready++;
@@ -272,23 +330,91 @@ static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
return num_ready;
}
/* This is where SQEs are submitted in the glib event loop */
static void fdmon_io_uring_gsource_prepare(AioContext *ctx)
{
fill_sq_ring(ctx);
if (io_uring_sq_ready(&ctx->fdmon_io_uring)) {
while (io_uring_submit(&ctx->fdmon_io_uring) == -EINTR) {
/* Keep trying if syscall was interrupted */
}
}
}
static bool fdmon_io_uring_gsource_check(AioContext *ctx)
{
gpointer tag = ctx->io_uring_fd_tag;
return g_source_query_unix_fd(&ctx->source, tag) & G_IO_IN;
}
/* Dispatch CQE handlers that are ready */
static bool fdmon_io_uring_dispatch(AioContext *ctx)
{
CqeHandlerSimpleQ *ready_list = &ctx->cqe_handler_ready_list;
bool progress = false;
/* Handlers may use defer_call() to coalesce frequent operations */
defer_call_begin();
while (!QSIMPLEQ_EMPTY(ready_list)) {
CqeHandler *cqe_handler = QSIMPLEQ_FIRST(ready_list);
QSIMPLEQ_REMOVE_HEAD(ready_list, next);
trace_fdmon_io_uring_cqe_handler(ctx, cqe_handler,
cqe_handler->cqe.res);
cqe_handler->cb(cqe_handler);
progress = true;
}
defer_call_end();
return progress;
}
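
The defer_call_begin()/defer_call_end() bracket lets completion callbacks coalesce expensive follow-up work: a function scheduled with defer_call() inside the bracket runs once when the outermost defer_call_end() is reached, with duplicate fn/opaque pairs merged. A minimal sketch under those assumptions (MyQueue, my_flush and my_cqe_cb are invented names, not from the patch):

#include "qemu/osdep.h"
#include "qemu/defer-call.h"
#include "block/aio.h"

typedef struct {
    CqeHandler cqe_handler;
    int pending; /* completions seen since the last flush */
} MyQueue;

/* Runs once at the outermost defer_call_end(), even if many
 * completions queued it during the same dispatch round. */
static void my_flush(void *opaque)
{
    MyQueue *q = opaque;
    /* e.g. ring a hardware doorbell once for q->pending items */
    q->pending = 0;
}

static void my_cqe_cb(CqeHandler *cqe_handler)
{
    MyQueue *q = container_of(cqe_handler, MyQueue, cqe_handler);
    q->pending++;
    defer_call(my_flush, q); /* coalesced with identical fn/opaque calls */
}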
/* This is where CQEs are processed in the glib event loop */
static void fdmon_io_uring_gsource_dispatch(AioContext *ctx,
AioHandlerList *ready_list)
{
process_cq_ring(ctx, ready_list);
}
static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
int64_t timeout)
{
struct __kernel_timespec ts;
unsigned wait_nr = 1; /* block until at least one cqe is ready */
int ret;
if (timeout == 0) {
wait_nr = 0; /* non-blocking */
} else if (timeout > 0) {
add_timeout_sqe(ctx, timeout);
/* Add a timeout that self-cancels when another cqe becomes ready */
struct io_uring_sqe *sqe;
ts = (struct __kernel_timespec){
.tv_sec = timeout / NANOSECONDS_PER_SECOND,
.tv_nsec = timeout % NANOSECONDS_PER_SECOND,
};
sqe = get_sqe(ctx);
io_uring_prep_timeout(sqe, &ts, 1, 0);
io_uring_sqe_set_data(sqe, NULL);
}
fill_sq_ring(ctx);
/*
* Loop to handle signals in both cases:
* 1. If no SQEs were submitted, then -EINTR is returned.
* 2. If SQEs were submitted then the number of SQEs submitted is returned
* rather than -EINTR.
*/
do {
ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
} while (ret == -EINTR);
} while (ret == -EINTR ||
(ret >= 0 && wait_nr > io_uring_cq_ready(&ctx->fdmon_io_uring)));
assert(ret >= 0);
@@ -319,43 +445,66 @@ static const FDMonOps fdmon_io_uring_ops = {
.update = fdmon_io_uring_update,
.wait = fdmon_io_uring_wait,
.need_wait = fdmon_io_uring_need_wait,
.dispatch = fdmon_io_uring_dispatch,
.gsource_prepare = fdmon_io_uring_gsource_prepare,
.gsource_check = fdmon_io_uring_gsource_check,
.gsource_dispatch = fdmon_io_uring_gsource_dispatch,
.add_sqe = fdmon_io_uring_add_sqe,
};
bool fdmon_io_uring_setup(AioContext *ctx)
bool fdmon_io_uring_setup(AioContext *ctx, Error **errp)
{
int ret;
ctx->io_uring_fd_tag = NULL;
ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
if (ret != 0) {
error_setg_errno(errp, -ret, "Failed to initialize io_uring");
return false;
}
QSLIST_INIT(&ctx->submit_list);
QSIMPLEQ_INIT(&ctx->cqe_handler_ready_list);
ctx->fdmon_ops = &fdmon_io_uring_ops;
ctx->io_uring_fd_tag = g_source_add_unix_fd(&ctx->source,
ctx->fdmon_io_uring.ring_fd, G_IO_IN);
return true;
}
void fdmon_io_uring_destroy(AioContext *ctx)
{
if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
AioHandler *node;
io_uring_queue_exit(&ctx->fdmon_io_uring);
if (ctx->fdmon_ops != &fdmon_io_uring_ops) {
return;
}
/* Move handlers due to be removed onto the deleted list */
while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
unsigned flags = qatomic_fetch_and(&node->flags,
~(FDMON_IO_URING_PENDING |
FDMON_IO_URING_ADD |
FDMON_IO_URING_REMOVE));
io_uring_queue_exit(&ctx->fdmon_io_uring);
if (flags & FDMON_IO_URING_REMOVE) {
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
}
/* Move handlers due to be removed onto the deleted list */
while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
unsigned flags = qatomic_fetch_and(&node->flags,
~(FDMON_IO_URING_PENDING |
FDMON_IO_URING_ADD |
FDMON_IO_URING_REMOVE |
FDMON_IO_URING_DELETE_AIO_HANDLER));
QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
if ((flags & FDMON_IO_URING_REMOVE) ||
(flags & FDMON_IO_URING_DELETE_AIO_HANDLER)) {
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers,
node, node_deleted);
}
ctx->fdmon_ops = &fdmon_poll_ops;
QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
}
g_source_remove_unix_fd(&ctx->source, ctx->io_uring_fd_tag);
ctx->io_uring_fd_tag = NULL;
assert(QSIMPLEQ_EMPTY(&ctx->cqe_handler_ready_list));
qemu_lockcnt_lock(&ctx->list_lock);
fdmon_poll_downgrade(ctx);
qemu_lockcnt_unlock(&ctx->list_lock);
}


@@ -72,6 +72,11 @@ static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
/* epoll(7) is faster above a certain number of fds */
if (fdmon_epoll_try_upgrade(ctx, npfd)) {
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events) {
g_source_remove_poll(&ctx->source, &node->pfd);
}
}
npfd = 0; /* we won't need pollfds[], reset npfd */
return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
}
@@ -97,11 +102,89 @@ static void fdmon_poll_update(AioContext *ctx,
AioHandler *old_node,
AioHandler *new_node)
{
/* Do nothing, AioHandler already contains the state we'll need */
if (old_node) {
/*
* If the GSource is in the process of being destroyed then
* g_source_remove_poll() causes an assertion failure. Skip removal in
* that case, because glib cleans up its state during destruction
* anyway.
*/
if (!g_source_is_destroyed(&ctx->source)) {
g_source_remove_poll(&ctx->source, &old_node->pfd);
}
}
if (new_node) {
g_source_add_poll(&ctx->source, &new_node->pfd);
}
}
static void fdmon_poll_gsource_prepare(AioContext *ctx)
{
/* Do nothing */
}
static bool fdmon_poll_gsource_check(AioContext *ctx)
{
AioHandler *node;
bool result = false;
/*
* We have to walk very carefully in case aio_set_fd_handler is
* called while we're walking.
*/
qemu_lockcnt_inc(&ctx->list_lock);
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
int revents = node->pfd.revents & node->pfd.events;
if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
result = true;
break;
}
if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
result = true;
break;
}
}
qemu_lockcnt_dec(&ctx->list_lock);
return result;
}
static void fdmon_poll_gsource_dispatch(AioContext *ctx,
AioHandlerList *ready_list)
{
AioHandler *node;
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
int revents = node->pfd.revents;
if (revents) {
aio_add_ready_handler(ready_list, node, revents);
}
}
}
const FDMonOps fdmon_poll_ops = {
.update = fdmon_poll_update,
.wait = fdmon_poll_wait,
.need_wait = aio_poll_disabled,
.gsource_prepare = fdmon_poll_gsource_prepare,
.gsource_check = fdmon_poll_gsource_check,
.gsource_dispatch = fdmon_poll_gsource_dispatch,
};
void fdmon_poll_downgrade(AioContext *ctx)
{
AioHandler *node;
ctx->fdmon_ops = &fdmon_poll_ops;
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events) {
g_source_add_poll(&ctx->source, &node->pfd);
}
}
}


@@ -24,6 +24,10 @@ buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes
buffer_move(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
buffer_free(const char *buf, size_t len) "%s: capacity %zd"
# fdmon-io_uring.c
fdmon_io_uring_add_sqe(void *ctx, void *opaque, int opcode, int fd, uint64_t off, void *cqe_handler) "ctx %p opaque %p opcode %d fd %d off %"PRId64" cqe_handler %p"
fdmon_io_uring_cqe_handler(void *ctx, void *cqe_handler, int cqe_res) "ctx %p cqe_handler %p cqe_res %d"
# filemonitor-inotify.c
qemu_file_monitor_add_watch(void *mon, const char *dirpath, const char *filename, void *cb, void *opaque, int64_t id) "File monitor %p add watch dir='%s' file='%s' cb=%p opaque=%p id=%" PRId64
qemu_file_monitor_remove_watch(void *mon, const char *dirpath, int64_t id) "File monitor %p remove watch dir='%s' id=%" PRId64