diff --git a/block.c b/block.c index cf08e64add..4f1581cedf 100644 --- a/block.c +++ b/block.c @@ -693,7 +693,7 @@ out: } int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts, - Error **errp) + bool allow_protocol_prefix, Error **errp) { QemuOpts *protocol_opts; BlockDriver *drv; @@ -702,7 +702,7 @@ int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts, GLOBAL_STATE_CODE(); - drv = bdrv_find_protocol(filename, true, errp); + drv = bdrv_find_protocol(filename, allow_protocol_prefix, errp); if (drv == NULL) { return -ENOENT; } @@ -5398,17 +5398,13 @@ bdrv_replace_node_noperm(BlockDriverState *from, * * With auto_skip=false the error is returned if from has a parent which should * not be updated. - * - * With @detach_subchain=true @to must be in a backing chain of @from. In this - * case backing link of the cow-parent of @to is removed. */ static int GRAPH_WRLOCK bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to, - bool auto_skip, bool detach_subchain, Error **errp) + bool auto_skip, Error **errp) { Transaction *tran = tran_new(); g_autoptr(GSList) refresh_list = NULL; - BlockDriverState *to_cow_parent = NULL; int ret; GLOBAL_STATE_CODE(); @@ -5417,17 +5413,6 @@ bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to, assert(to->quiesce_counter); assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to)); - if (detach_subchain) { - assert(bdrv_chain_contains(from, to)); - assert(from != to); - for (to_cow_parent = from; - bdrv_filter_or_cow_bs(to_cow_parent) != to; - to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent)) - { - ; - } - } - /* * Do the replacement without permission update. * Replacement may influence the permissions, we should calculate new @@ -5439,11 +5424,6 @@ bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to, goto out; } - if (detach_subchain) { - /* to_cow_parent is already drained because from is drained */ - bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran); - } - refresh_list = g_slist_prepend(refresh_list, to); refresh_list = g_slist_prepend(refresh_list, from); @@ -5462,7 +5442,7 @@ out: int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to, Error **errp) { - return bdrv_replace_node_common(from, to, true, false, errp); + return bdrv_replace_node_common(from, to, true, errp); } int bdrv_drop_filter(BlockDriverState *bs, Error **errp) @@ -5478,7 +5458,7 @@ int bdrv_drop_filter(BlockDriverState *bs, Error **errp) bdrv_drained_begin(child_bs); bdrv_graph_wrlock(); - ret = bdrv_replace_node_common(bs, child_bs, true, true, errp); + ret = bdrv_replace_node_common(bs, child_bs, true, errp); bdrv_graph_wrunlock(); bdrv_drained_end(child_bs); @@ -5929,17 +5909,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, updated_children = g_slist_prepend(updated_children, c); } - /* - * It seems correct to pass detach_subchain=true here, but it triggers - * one more yet not fixed bug, when due to nested aio_poll loop we switch to - * another drained section, which modify the graph (for example, removing - * the child, which we keep in updated_children list). So, it's a TODO. - * - * Note, bug triggered if pass detach_subchain=true here and run - * test-bdrv-drain. test_drop_intermediate_poll() test-case will crash. - * That's a FIXME. 
- */ - bdrv_replace_node_common(top, base, false, false, &local_err); + bdrv_replace_node_common(top, base, false, &local_err); bdrv_graph_wrunlock(); if (local_err) { diff --git a/block/bochs.c b/block/bochs.c index b099fb52fe..bfda88017d 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -300,15 +300,15 @@ static void bochs_close(BlockDriverState *bs) } static BlockDriver bdrv_bochs = { - .format_name = "bochs", - .instance_size = sizeof(BDRVBochsState), - .bdrv_probe = bochs_probe, - .bdrv_open = bochs_open, + .format_name = "bochs", + .instance_size = sizeof(BDRVBochsState), + .bdrv_probe = bochs_probe, + .bdrv_open = bochs_open, .bdrv_child_perm = bdrv_default_perms, .bdrv_refresh_limits = bochs_refresh_limits, - .bdrv_co_preadv = bochs_co_preadv, - .bdrv_close = bochs_close, - .is_format = true, + .bdrv_co_preadv = bochs_co_preadv, + .bdrv_close = bochs_close, + .is_format = true, }; static void bdrv_bochs_init(void) diff --git a/block/crypto.c b/block/crypto.c index 7c37b23e36..b97d027444 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -835,7 +835,7 @@ block_crypto_co_create_opts_luks(BlockDriver *drv, const char *filename, } /* Create protocol layer */ - ret = bdrv_co_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, true, errp); if (ret < 0) { goto fail; } diff --git a/block/file-posix.c b/block/file-posix.c index 8c738674ce..12d12970fa 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -133,7 +133,7 @@ #define FTYPE_FILE 0 #define FTYPE_CD 1 -#define MAX_BLOCKSIZE 4096 +#define MAX_BLOCKSIZE 4096 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, * leaving a few more bytes for its future use. */ @@ -755,14 +755,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif /* !defined(CONFIG_LINUX_AIO) */ -#ifndef CONFIG_LINUX_IO_URING if (s->use_linux_io_uring) { +#ifdef CONFIG_LINUX_IO_URING + if (!aio_has_io_uring()) { + error_setg(errp, "aio=io_uring was specified, but is not " + "available (disabled via io_uring_disabled " + "sysctl or blocked by container runtime " + "seccomp policy?)"); + ret = -EINVAL; + goto fail; + } +#else error_setg(errp, "aio=io_uring was specified, but is not supported " - "in this build."); + "in this build"); ret = -EINVAL; goto fail; - } #endif /* !defined(CONFIG_LINUX_IO_URING) */ + } s->has_discard = true; s->has_write_zeroes = true; @@ -2522,27 +2531,6 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) return true; } -#ifdef CONFIG_LINUX_IO_URING -static inline bool raw_check_linux_io_uring(BDRVRawState *s) -{ - Error *local_err = NULL; - AioContext *ctx; - - if (!s->use_linux_io_uring) { - return false; - } - - ctx = qemu_get_current_aio_context(); - if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) { - error_reportf_err(local_err, "Unable to use linux io_uring, " - "falling back to thread pool: "); - s->use_linux_io_uring = false; - return false; - } - return true; -} -#endif - #ifdef CONFIG_LINUX_AIO static inline bool raw_check_linux_aio(BDRVRawState *s) { @@ -2595,7 +2583,7 @@ raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, uint64_t bytes, if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) { type |= QEMU_AIO_MISALIGNED; #ifdef CONFIG_LINUX_IO_URING - } else if (raw_check_linux_io_uring(s)) { + } else if (s->use_linux_io_uring) { assert(qiov->size == bytes); ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags); goto out; @@ -2692,7 +2680,7 @@ static int coroutine_fn 
raw_co_flush_to_disk(BlockDriverState *bs) }; #ifdef CONFIG_LINUX_IO_URING - if (raw_check_linux_io_uring(s)) { + if (s->use_linux_io_uring) { return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0); } #endif @@ -4574,20 +4562,20 @@ static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked) } static BlockDriver bdrv_host_cdrom = { - .format_name = "host_cdrom", - .protocol_name = "host_cdrom", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_probe_device = cdrom_probe_device, - .bdrv_parse_filename = cdrom_parse_filename, - .bdrv_open = cdrom_open, - .bdrv_close = raw_close, - .bdrv_reopen_prepare = raw_reopen_prepare, - .bdrv_reopen_commit = raw_reopen_commit, - .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_co_create_opts = bdrv_co_create_opts_simple, - .create_opts = &bdrv_create_opts_simple, - .mutable_opts = mutable_opts, + .format_name = "host_cdrom", + .protocol_name = "host_cdrom", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_probe_device = cdrom_probe_device, + .bdrv_parse_filename = cdrom_parse_filename, + .bdrv_open = cdrom_open, + .bdrv_close = raw_close, + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, + .bdrv_co_create_opts = bdrv_co_create_opts_simple, + .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, .bdrv_co_invalidate_cache = raw_co_invalidate_cache, .bdrv_co_preadv = raw_co_preadv, @@ -4700,20 +4688,20 @@ static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked) } static BlockDriver bdrv_host_cdrom = { - .format_name = "host_cdrom", - .protocol_name = "host_cdrom", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_probe_device = cdrom_probe_device, - .bdrv_parse_filename = cdrom_parse_filename, - .bdrv_open = cdrom_open, - .bdrv_close = raw_close, - .bdrv_reopen_prepare = raw_reopen_prepare, - .bdrv_reopen_commit = raw_reopen_commit, - .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_co_create_opts = bdrv_co_create_opts_simple, - .create_opts = &bdrv_create_opts_simple, - .mutable_opts = mutable_opts, + .format_name = "host_cdrom", + .protocol_name = "host_cdrom", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_probe_device = cdrom_probe_device, + .bdrv_parse_filename = cdrom_parse_filename, + .bdrv_open = cdrom_open, + .bdrv_close = raw_close, + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, + .bdrv_co_create_opts = bdrv_co_create_opts_simple, + .create_opts = &bdrv_create_opts_simple, + .mutable_opts = mutable_opts, .bdrv_co_preadv = raw_co_preadv, .bdrv_co_pwritev = raw_co_pwritev, diff --git a/block/file-win32.c b/block/file-win32.c index af9aea631c..0efb609e1d 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -741,16 +741,16 @@ static QemuOptsList raw_create_opts = { }; BlockDriver bdrv_file = { - .format_name = "file", - .protocol_name = "file", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_parse_filename = raw_parse_filename, - .bdrv_open = raw_open, - .bdrv_refresh_limits = raw_probe_alignment, - .bdrv_close = raw_close, - .bdrv_co_create_opts = raw_co_create_opts, - .bdrv_has_zero_init = bdrv_has_zero_init_1, + .format_name = "file", + .protocol_name = "file", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_parse_filename = 
raw_parse_filename, + .bdrv_open = raw_open, + .bdrv_refresh_limits = raw_probe_alignment, + .bdrv_close = raw_close, + .bdrv_co_create_opts = raw_co_create_opts, + .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, @@ -914,15 +914,15 @@ done: } static BlockDriver bdrv_host_device = { - .format_name = "host_device", - .protocol_name = "host_device", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_parse_filename = hdev_parse_filename, - .bdrv_probe_device = hdev_probe_device, - .bdrv_open = hdev_open, - .bdrv_close = raw_close, - .bdrv_refresh_limits = hdev_refresh_limits, + .format_name = "host_device", + .protocol_name = "host_device", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_parse_filename = hdev_parse_filename, + .bdrv_probe_device = hdev_probe_device, + .bdrv_open = hdev_open, + .bdrv_close = raw_close, + .bdrv_refresh_limits = hdev_refresh_limits, .bdrv_aio_preadv = raw_aio_preadv, .bdrv_aio_pwritev = raw_aio_pwritev, diff --git a/block/io_uring.c b/block/io_uring.c index dd4f304910..f1514cf024 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -11,28 +11,20 @@ #include "qemu/osdep.h" #include #include "block/aio.h" -#include "qemu/queue.h" #include "block/block.h" #include "block/raw-aio.h" #include "qemu/coroutine.h" -#include "qemu/defer-call.h" -#include "qapi/error.h" #include "system/block-backend.h" #include "trace.h" -/* Only used for assertions. */ -#include "qemu/coroutine_int.h" - -/* io_uring ring size */ -#define MAX_ENTRIES 128 - -typedef struct LuringAIOCB { +typedef struct { Coroutine *co; - struct io_uring_sqe sqeq; - ssize_t ret; QEMUIOVector *qiov; - bool is_read; - QSIMPLEQ_ENTRY(LuringAIOCB) next; + uint64_t offset; + ssize_t ret; + int type; + int fd; + BdrvRequestFlags flags; /* * Buffered reads may require resubmission, see @@ -40,36 +32,69 @@ typedef struct LuringAIOCB { */ int total_read; QEMUIOVector resubmit_qiov; -} LuringAIOCB; -typedef struct LuringQueue { - unsigned int in_queue; - unsigned int in_flight; - bool blocked; - QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue; -} LuringQueue; + CqeHandler cqe_handler; +} LuringRequest; -struct LuringState { - AioContext *aio_context; - - struct io_uring ring; - - /* No locking required, only accessed from AioContext home thread */ - LuringQueue io_q; - - QEMUBH *completion_bh; -}; - -/** - * luring_resubmit: - * - * Resubmit a request by appending it to submit_queue. The caller must ensure - * that ioq_submit() is called later so that submit_queue requests are started. - */ -static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) +static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque) { - QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next); - s->io_q.in_queue++; + LuringRequest *req = opaque; + QEMUIOVector *qiov = req->qiov; + uint64_t offset = req->offset; + int fd = req->fd; + BdrvRequestFlags flags = req->flags; + + switch (req->type) { + case QEMU_AIO_WRITE: + { + int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0; + if (luring_flags != 0 || qiov->niov > 1) { +#ifdef HAVE_IO_URING_PREP_WRITEV2 + io_uring_prep_writev2(sqe, fd, qiov->iov, + qiov->niov, offset, luring_flags); +#else + /* + * FUA should only be enabled with HAVE_IO_URING_PREP_WRITEV2, see + * luring_has_fua(). 
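+             * io_uring_prep_writev() takes no RWF_* flags argument, hence the
+             * assertion below.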
+ */ + assert(luring_flags == 0); + + io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset); +#endif + } else { + /* The man page says non-vectored is faster than vectored */ + struct iovec *iov = qiov->iov; + io_uring_prep_write(sqe, fd, iov->iov_base, iov->iov_len, offset); + } + break; + } + case QEMU_AIO_ZONE_APPEND: + io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset); + break; + case QEMU_AIO_READ: + { + if (req->resubmit_qiov.iov != NULL) { + qiov = &req->resubmit_qiov; + } + if (qiov->niov > 1) { + io_uring_prep_readv(sqe, fd, qiov->iov, qiov->niov, + offset + req->total_read); + } else { + /* The man page says non-vectored is faster than vectored */ + struct iovec *iov = qiov->iov; + io_uring_prep_read(sqe, fd, iov->iov_base, iov->iov_len, + offset + req->total_read); + } + break; + } + case QEMU_AIO_FLUSH: + io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC); + break; + default: + fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n", + __func__, req->type); + abort(); + } } /** @@ -78,385 +103,115 @@ static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) * Short reads are rare but may occur. The remaining read request needs to be * resubmitted. */ -static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, - int nread) +static void luring_resubmit_short_read(LuringRequest *req, int nread) { QEMUIOVector *resubmit_qiov; size_t remaining; - trace_luring_resubmit_short_read(s, luringcb, nread); + trace_luring_resubmit_short_read(req, nread); /* Update read position */ - luringcb->total_read += nread; - remaining = luringcb->qiov->size - luringcb->total_read; + req->total_read += nread; + remaining = req->qiov->size - req->total_read; /* Shorten qiov */ - resubmit_qiov = &luringcb->resubmit_qiov; + resubmit_qiov = &req->resubmit_qiov; if (resubmit_qiov->iov == NULL) { - qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov); + qemu_iovec_init(resubmit_qiov, req->qiov->niov); } else { qemu_iovec_reset(resubmit_qiov); } - qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read, - remaining); + qemu_iovec_concat(resubmit_qiov, req->qiov, req->total_read, remaining); - /* Update sqe */ - luringcb->sqeq.off += nread; - luringcb->sqeq.addr = (uintptr_t)luringcb->resubmit_qiov.iov; - luringcb->sqeq.len = luringcb->resubmit_qiov.niov; - - luring_resubmit(s, luringcb); + aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler); } -/** - * luring_process_completions: - * @s: AIO state - * - * Fetches completed I/O requests, consumes cqes and invokes their callbacks - * The function is somewhat tricky because it supports nested event loops, for - * example when a request callback invokes aio_poll(). - * - * Function schedules BH completion so it can be called again in a nested - * event loop. When there are no events left to complete the BH is being - * canceled. - * - */ -static void luring_process_completions(LuringState *s) +static void luring_cqe_handler(CqeHandler *cqe_handler) { - struct io_uring_cqe *cqes; - int total_bytes; + LuringRequest *req = container_of(cqe_handler, LuringRequest, cqe_handler); + int ret = cqe_handler->cqe.res; - defer_call_begin(); + trace_luring_cqe_handler(req, ret); - /* - * Request completion callbacks can run the nested event loop. - * Schedule ourselves so the nested event loop will "see" remaining - * completed requests and process them. Without this, completion - * callbacks that wait for other requests using a nested event loop - * would hang forever. 
- * - * This workaround is needed because io_uring uses poll_wait, which - * is woken up when new events are added to the uring, thus polling on - * the same uring fd will block unless more events are received. - * - * Other leaf block drivers (drivers that access the data themselves) - * are networking based, so they poll sockets for data and run the - * correct coroutine. - */ - qemu_bh_schedule(s->completion_bh); - - while (io_uring_peek_cqe(&s->ring, &cqes) == 0) { - LuringAIOCB *luringcb; - int ret; - - if (!cqes) { - break; + if (ret < 0) { + /* + * Only writev/readv/fsync requests on regular files or host block + * devices are submitted. Therefore -EAGAIN is not expected but it's + * known to happen sometimes with Linux SCSI. Submit again and hope + * the request completes successfully. + * + * For more information, see: + * https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u + * + * If the code is changed to submit other types of requests in the + * future, then this workaround may need to be extended to deal with + * genuine -EAGAIN results that should not be resubmitted + * immediately. + */ + if (ret == -EINTR || ret == -EAGAIN) { + aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler); + return; } - - luringcb = io_uring_cqe_get_data(cqes); - ret = cqes->res; - io_uring_cqe_seen(&s->ring, cqes); - cqes = NULL; - - /* Change counters one-by-one because we can be nested. */ - s->io_q.in_flight--; - trace_luring_process_completion(s, luringcb, ret); - + } else if (req->qiov) { /* total_read is non-zero only for resubmitted read requests */ - total_bytes = ret + luringcb->total_read; + int total_bytes = ret + req->total_read; - if (ret < 0) { - /* - * Only writev/readv/fsync requests on regular files or host block - * devices are submitted. Therefore -EAGAIN is not expected but it's - * known to happen sometimes with Linux SCSI. Submit again and hope - * the request completes successfully. - * - * For more information, see: - * https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u - * - * If the code is changed to submit other types of requests in the - * future, then this workaround may need to be extended to deal with - * genuine -EAGAIN results that should not be resubmitted - * immediately. - */ - if (ret == -EINTR || ret == -EAGAIN) { - luring_resubmit(s, luringcb); - continue; - } - } else if (!luringcb->qiov) { - goto end; - } else if (total_bytes == luringcb->qiov->size) { + if (total_bytes == req->qiov->size) { ret = 0; - /* Only read/write */ } else { /* Short Read/Write */ - if (luringcb->is_read) { + if (req->type == QEMU_AIO_READ) { if (ret > 0) { - luring_resubmit_short_read(s, luringcb, ret); - continue; - } else { - /* Pad with zeroes */ - qemu_iovec_memset(luringcb->qiov, total_bytes, 0, - luringcb->qiov->size - total_bytes); - ret = 0; + luring_resubmit_short_read(req, ret); + return; } + + /* Pad with zeroes */ + qemu_iovec_memset(req->qiov, total_bytes, 0, + req->qiov->size - total_bytes); + ret = 0; } else { ret = -ENOSPC; } } -end: - luringcb->ret = ret; - qemu_iovec_destroy(&luringcb->resubmit_qiov); - - /* - * If the coroutine is already entered it must be in ioq_submit() - * and will notice luringcb->ret has been filled in when it - * eventually runs later. Coroutines cannot be entered recursively - * so avoid doing that! 
- */ - assert(luringcb->co->ctx == s->aio_context); - if (!qemu_coroutine_entered(luringcb->co)) { - aio_co_wake(luringcb->co); - } } - qemu_bh_cancel(s->completion_bh); + req->ret = ret; + qemu_iovec_destroy(&req->resubmit_qiov); - defer_call_end(); -} - -static int ioq_submit(LuringState *s) -{ - int ret = 0; - LuringAIOCB *luringcb, *luringcb_next; - - while (s->io_q.in_queue > 0) { - /* - * Try to fetch sqes from the ring for requests waiting in - * the overflow queue - */ - QSIMPLEQ_FOREACH_SAFE(luringcb, &s->io_q.submit_queue, next, - luringcb_next) { - struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring); - if (!sqes) { - break; - } - /* Prep sqe for submission */ - *sqes = luringcb->sqeq; - QSIMPLEQ_REMOVE_HEAD(&s->io_q.submit_queue, next); - } - ret = io_uring_submit(&s->ring); - trace_luring_io_uring_submit(s, ret); - /* Prevent infinite loop if submission is refused */ - if (ret <= 0) { - if (ret == -EAGAIN || ret == -EINTR) { - continue; - } - break; - } - s->io_q.in_flight += ret; - s->io_q.in_queue -= ret; - } - s->io_q.blocked = (s->io_q.in_queue > 0); - - if (s->io_q.in_flight) { - /* - * We can try to complete something just right away if there are - * still requests in-flight. - */ - luring_process_completions(s); - } - return ret; -} - -static void luring_process_completions_and_submit(LuringState *s) -{ - luring_process_completions(s); - - if (s->io_q.in_queue > 0) { - ioq_submit(s); + /* + * If the coroutine is already entered it must be in luring_co_submit() and + * will notice req->ret has been filled in when it eventually runs later. + * Coroutines cannot be entered recursively so avoid doing that! + */ + if (!qemu_coroutine_entered(req->co)) { + aio_co_wake(req->co); } } -static void qemu_luring_completion_bh(void *opaque) +int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, + uint64_t offset, QEMUIOVector *qiov, + int type, BdrvRequestFlags flags) { - LuringState *s = opaque; - luring_process_completions_and_submit(s); -} - -static void qemu_luring_completion_cb(void *opaque) -{ - LuringState *s = opaque; - luring_process_completions_and_submit(s); -} - -static bool qemu_luring_poll_cb(void *opaque) -{ - LuringState *s = opaque; - - return io_uring_cq_ready(&s->ring); -} - -static void qemu_luring_poll_ready(void *opaque) -{ - LuringState *s = opaque; - - luring_process_completions_and_submit(s); -} - -static void ioq_init(LuringQueue *io_q) -{ - QSIMPLEQ_INIT(&io_q->submit_queue); - io_q->in_queue = 0; - io_q->in_flight = 0; - io_q->blocked = false; -} - -static void luring_deferred_fn(void *opaque) -{ - LuringState *s = opaque; - trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue, - s->io_q.in_flight); - if (!s->io_q.blocked && s->io_q.in_queue > 0) { - ioq_submit(s); - } -} - -/** - * luring_do_submit: - * @fd: file descriptor for I/O - * @luringcb: AIO control block - * @s: AIO state - * @offset: offset for request - * @type: type of request - * - * Fetches sqes from ring, adds to pending queue and preps them - * - */ -static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, - uint64_t offset, int type, BdrvRequestFlags flags) -{ - int ret; - struct io_uring_sqe *sqes = &luringcb->sqeq; - - switch (type) { - case QEMU_AIO_WRITE: -#ifdef HAVE_IO_URING_PREP_WRITEV2 - { - int luring_flags = (flags & BDRV_REQ_FUA) ? 
RWF_DSYNC : 0; - io_uring_prep_writev2(sqes, fd, luringcb->qiov->iov, - luringcb->qiov->niov, offset, luring_flags); - } -#else - assert(flags == 0); - io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, - luringcb->qiov->niov, offset); -#endif - break; - case QEMU_AIO_ZONE_APPEND: - io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, - luringcb->qiov->niov, offset); - break; - case QEMU_AIO_READ: - io_uring_prep_readv(sqes, fd, luringcb->qiov->iov, - luringcb->qiov->niov, offset); - break; - case QEMU_AIO_FLUSH: - io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC); - break; - default: - fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n", - __func__, type); - abort(); - } - io_uring_sqe_set_data(sqes, luringcb); - - QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next); - s->io_q.in_queue++; - trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue, - s->io_q.in_flight); - if (!s->io_q.blocked) { - if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) { - ret = ioq_submit(s); - trace_luring_do_submit_done(s, ret); - return ret; - } - - defer_call(luring_deferred_fn, s); - } - return 0; -} - -int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, - QEMUIOVector *qiov, int type, - BdrvRequestFlags flags) -{ - int ret; - AioContext *ctx = qemu_get_current_aio_context(); - LuringState *s = aio_get_linux_io_uring(ctx); - LuringAIOCB luringcb = { + LuringRequest req = { .co = qemu_coroutine_self(), - .ret = -EINPROGRESS, .qiov = qiov, - .is_read = (type == QEMU_AIO_READ), + .ret = -EINPROGRESS, + .type = type, + .fd = fd, + .offset = offset, + .flags = flags, }; - trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0, - type); - ret = luring_do_submit(fd, &luringcb, s, offset, type, flags); - if (ret < 0) { - return ret; - } + req.cqe_handler.cb = luring_cqe_handler; - if (luringcb.ret == -EINPROGRESS) { + trace_luring_co_submit(bs, &req, fd, offset, qiov ? 
qiov->size : 0, type); + aio_add_sqe(luring_prep_sqe, &req, &req.cqe_handler); + + if (req.ret == -EINPROGRESS) { qemu_coroutine_yield(); } - return luringcb.ret; -} - -void luring_detach_aio_context(LuringState *s, AioContext *old_context) -{ - aio_set_fd_handler(old_context, s->ring.ring_fd, - NULL, NULL, NULL, NULL, s); - qemu_bh_delete(s->completion_bh); - s->aio_context = NULL; -} - -void luring_attach_aio_context(LuringState *s, AioContext *new_context) -{ - s->aio_context = new_context; - s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s); - aio_set_fd_handler(s->aio_context, s->ring.ring_fd, - qemu_luring_completion_cb, NULL, - qemu_luring_poll_cb, qemu_luring_poll_ready, s); -} - -LuringState *luring_init(Error **errp) -{ - int rc; - LuringState *s = g_new0(LuringState, 1); - struct io_uring *ring = &s->ring; - - trace_luring_init_state(s, sizeof(*s)); - - rc = io_uring_queue_init(MAX_ENTRIES, ring, 0); - if (rc < 0) { - error_setg_errno(errp, -rc, "failed to init linux io_uring ring"); - g_free(s); - return NULL; - } - - ioq_init(&s->io_q); - return s; - -} - -void luring_cleanup(LuringState *s) -{ - io_uring_queue_exit(&s->ring); - trace_luring_cleanup_state(s); - g_free(s); + return req.ret; } bool luring_has_fua(void) diff --git a/block/parallels.c b/block/parallels.c index 3a375e2a8a..7a90fb5220 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -1117,7 +1117,7 @@ parallels_co_create_opts(BlockDriver *drv, const char *filename, } /* Create and open the file (protocol layer) */ - ret = bdrv_co_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, true, errp); if (ret < 0) { goto done; } diff --git a/block/qcow.c b/block/qcow.c index 8a3e7591a9..3d37d26ee8 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -978,7 +978,7 @@ qcow_co_create_opts(BlockDriver *drv, const char *filename, } /* Create and open the file (protocol layer) */ - ret = bdrv_co_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, true, errp); if (ret < 0) { goto fail; } @@ -1184,11 +1184,11 @@ static const char *const qcow_strong_runtime_opts[] = { }; static BlockDriver bdrv_qcow = { - .format_name = "qcow", - .instance_size = sizeof(BDRVQcowState), - .bdrv_probe = qcow_probe, - .bdrv_open = qcow_open, - .bdrv_close = qcow_close, + .format_name = "qcow", + .instance_size = sizeof(BDRVQcowState), + .bdrv_probe = qcow_probe, + .bdrv_open = qcow_open, + .bdrv_close = qcow_close, .bdrv_child_perm = bdrv_default_perms, .bdrv_reopen_prepare = qcow_reopen_prepare, .bdrv_co_create = qcow_co_create, diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index ce8c0076b3..c655bf6df4 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1978,12 +1978,10 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters, if (!keep_reference) { /* Then decrease the refcount */ qcow2_free_any_cluster(bs, old_l2_entry, type); - } else if (s->discard_passthrough[type] && - (cluster_type == QCOW2_CLUSTER_NORMAL || - cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) { + } else { /* If we keep the reference, pass on the discard still */ - bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK, - s->cluster_size); + qcow2_discard_cluster(bs, old_l2_entry & L2E_OFFSET_MASK, + s->cluster_size, cluster_type, type); } } @@ -2092,12 +2090,10 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset, if (!keep_reference) { /* Then decrease the refcount */ qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST); - } else 
if (s->discard_passthrough[QCOW2_DISCARD_REQUEST] && - (type == QCOW2_CLUSTER_NORMAL || - type == QCOW2_CLUSTER_ZERO_ALLOC)) { + } else { /* If we keep the reference, pass on the discard still */ - bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK, - s->cluster_size); + qcow2_discard_cluster(bs, old_l2_entry & L2E_OFFSET_MASK, + s->cluster_size, type, QCOW2_DISCARD_REQUEST); } } } diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 0266542cee..6512cda407 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -754,8 +754,8 @@ void qcow2_process_discards(BlockDriverState *bs, int ret) } } -static void update_refcount_discard(BlockDriverState *bs, - uint64_t offset, uint64_t length) +static void queue_discard(BlockDriverState *bs, + uint64_t offset, uint64_t length) { BDRVQcow2State *s = bs->opaque; Qcow2DiscardRegion *d, *p, *next; @@ -902,7 +902,7 @@ update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, } if (s->discard_passthrough[type]) { - update_refcount_discard(bs, cluster_offset, s->cluster_size); + queue_discard(bs, cluster_offset, s->cluster_size); } } } @@ -1205,6 +1205,23 @@ void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry, } } +void qcow2_discard_cluster(BlockDriverState *bs, uint64_t offset, + uint64_t length, QCow2ClusterType ctype, + enum qcow2_discard_type dtype) +{ + BDRVQcow2State *s = bs->opaque; + + if (s->discard_passthrough[dtype] && + (ctype == QCOW2_CLUSTER_NORMAL || + ctype == QCOW2_CLUSTER_ZERO_ALLOC)) { + if (has_data_file(bs)) { + bdrv_pdiscard(s->data_file, offset, length); + } else { + queue_discard(bs, offset, length); + } + } +} + int qcow2_write_caches(BlockDriverState *bs) { BDRVQcow2State *s = bs->opaque; @@ -3619,7 +3636,7 @@ qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs) /* discard refblock from the cache if refblock is cached */ qcow2_cache_discard(s->refcount_block_cache, refblock); } - update_refcount_discard(bs, discard_block_offs, s->cluster_size); + queue_discard(bs, discard_block_offs, s->cluster_size); return 0; } diff --git a/block/qcow2.c b/block/qcow2.c index 4aa9f9e068..cb0bdb32ec 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -3956,7 +3956,7 @@ qcow2_co_create_opts(BlockDriver *drv, const char *filename, QemuOpts *opts, } /* Create and open the file (protocol layer) */ - ret = bdrv_co_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, true, errp); if (ret < 0) { goto finish; } @@ -3971,7 +3971,7 @@ qcow2_co_create_opts(BlockDriver *drv, const char *filename, QemuOpts *opts, /* Create and open an external data file (protocol layer) */ val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE); if (val) { - ret = bdrv_co_create_file(val, opts, errp); + ret = bdrv_co_create_file(val, opts, false, errp); if (ret < 0) { goto finish; } diff --git a/block/qcow2.h b/block/qcow2.h index a9e3481c6e..547bb2b814 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -880,6 +880,10 @@ void GRAPH_RDLOCK qcow2_free_clusters(BlockDriverState *bs, void GRAPH_RDLOCK qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry, enum qcow2_discard_type type); +void GRAPH_RDLOCK +qcow2_discard_cluster(BlockDriverState *bs, uint64_t offset, + uint64_t length, QCow2ClusterType ctype, + enum qcow2_discard_type dtype); int GRAPH_RDLOCK qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, diff --git a/block/qed.c b/block/qed.c index 4a36fb3929..da23a83d62 100644 --- a/block/qed.c +++ b/block/qed.c @@ -788,7 +788,7 @@ 
bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename, } /* Create and open the file (protocol layer) */ - ret = bdrv_co_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, true, errp); if (ret < 0) { goto fail; } diff --git a/block/raw-format.c b/block/raw-format.c index df16ac1ea2..a57c2922d5 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -463,7 +463,7 @@ static int coroutine_fn GRAPH_UNLOCKED raw_co_create_opts(BlockDriver *drv, const char *filename, QemuOpts *opts, Error **errp) { - return bdrv_co_create_file(filename, opts, errp); + return bdrv_co_create_file(filename, opts, true, errp); } static int raw_open(BlockDriverState *bs, QDict *options, int flags, diff --git a/block/trace-events b/block/trace-events index 8e789e1f12..c9b4736ff8 100644 --- a/block/trace-events +++ b/block/trace-events @@ -62,15 +62,9 @@ qmp_block_stream(void *bs) "bs %p" file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" # io_uring.c -luring_init_state(void *s, size_t size) "s %p size %zu" -luring_cleanup_state(void *s) "%p freed" -luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d" -luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d" -luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d" -luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d" -luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d" -luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d" -luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d" +luring_cqe_handler(void *req, int ret) "req %p ret %d" +luring_co_submit(void *bs, void *req, int fd, uint64_t offset, size_t nbytes, int type) "bs %p req %p fd %d offset %" PRId64 " nbytes %zd type %d" +luring_resubmit_short_read(void *req, int nread) "req %p nread %d" # qcow2.c qcow2_add_task(void *co, void *bs, void *pool, const char *action, int cluster_type, uint64_t host_offset, uint64_t offset, uint64_t bytes, void *qiov, size_t qiov_offset) "co %p bs %p pool %p: %s: cluster_type %d file_cluster_offset %" PRIu64 " offset %" PRIu64 " bytes %" PRIu64 " qiov %p qiov_offset %zu" diff --git a/block/vdi.c b/block/vdi.c index 3ddc62a569..87b874a7ef 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -938,7 +938,7 @@ vdi_co_create_opts(BlockDriver *drv, const char *filename, qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vdi_create_opts, true); /* Create and open the file (protocol layer) */ - ret = bdrv_co_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, true, errp); if (ret < 0) { goto done; } diff --git a/block/vhdx.c b/block/vhdx.c index b2a4b813a0..c16e4a00c8 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -2096,7 +2096,7 @@ vhdx_co_create_opts(BlockDriver *drv, const char *filename, } /* Create and open the file (protocol layer) */ - ret = bdrv_co_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, true, errp); if (ret < 0) { goto fail; } diff --git a/block/vmdk.c b/block/vmdk.c index 7b98debc2b..3b35b63cb5 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -2334,7 +2334,7 @@ vmdk_create_extent(const char *filename, int64_t filesize, bool flat, int ret; BlockBackend 
*blk = NULL; - ret = bdrv_co_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, false, errp); if (ret < 0) { goto exit; } diff --git a/block/vpc.c b/block/vpc.c index 801ff5793f..07e8ae0309 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -1118,7 +1118,7 @@ vpc_co_create_opts(BlockDriver *drv, const char *filename, } /* Create and open the file (protocol layer) */ - ret = bdrv_co_create_file(filename, opts, errp); + ret = bdrv_co_create_file(filename, opts, true, errp); if (ret < 0) { goto fail; } diff --git a/include/block/aio.h b/include/block/aio.h index 99ff48420b..f38b584ac7 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -61,6 +61,27 @@ typedef struct LuringState LuringState; /* Is polling disabled? */ bool aio_poll_disabled(AioContext *ctx); +#ifdef CONFIG_LINUX_IO_URING +/* + * Each io_uring request must have a unique CqeHandler that processes the cqe. + * The lifetime of a CqeHandler must be at least from aio_add_sqe() until + * ->cb() invocation. + */ +typedef struct CqeHandler CqeHandler; +struct CqeHandler { + /* Called by the AioContext when the request has completed */ + void (*cb)(CqeHandler *handler); + + /* Used internally, do not access this */ + QSIMPLEQ_ENTRY(CqeHandler) next; + + /* This field is filled in before ->cb() is called */ + struct io_uring_cqe cqe; +}; + +typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ; +#endif /* CONFIG_LINUX_IO_URING */ + /* Callbacks for file descriptor monitoring implementations */ typedef struct { /* @@ -106,6 +127,78 @@ typedef struct { * Returns: true if ->wait() should be called, false otherwise. */ bool (*need_wait)(AioContext *ctx); + + /* + * dispatch: + * @ctx: the AioContext + * + * Dispatch any work that is specific to this file descriptor monitoring + * implementation. Usually the event loop's generic file descriptor + * monitoring, BH, and timer dispatching code is sufficient, but file + * descriptor monitoring implementations offering additional functionality + * may need to implement this function for custom behavior. Called at a + * point in the event loop when it is safe to invoke user-defined + * callbacks. + * + * This function is optional and may be NULL. + * + * Returns: true if progress was made (see aio_poll()'s return value), + * false otherwise. + */ + bool (*dispatch)(AioContext *ctx); + + /* + * gsource_prepare: + * @ctx: the AioContext + * + * Prepare for the glib event loop to wait for events instead of the usual + * ->wait() call. See glib's GSourceFuncs->prepare(). + */ + void (*gsource_prepare)(AioContext *ctx); + + /* + * gsource_check: + * @ctx: the AioContext + * + * Called by the glib event loop from glib's GSourceFuncs->check() after + * waiting for events. + * + * Returns: true when ready to be dispatched. + */ + bool (*gsource_check)(AioContext *ctx); + + /* + * gsource_dispatch: + * @ctx: the AioContext + * @ready_list: list for handlers that become ready + * + * Place ready AioHandlers on ready_list. Called as part of the glib event + * loop from glib's GSourceFuncs->dispatch(). + * + * Called with list_lock incremented. + */ + void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list); + +#ifdef CONFIG_LINUX_IO_URING + /** + * add_sqe: Add an io_uring sqe for submission. 
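+     * This callback is optional and may be NULL; aio_has_io_uring() checks
+     * for its presence to decide whether io_uring can be used.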
+ * @prep_sqe: invoked with an sqe that should be prepared for submission + * @opaque: user-defined argument to @prep_sqe() + * @cqe_handler: the unique cqe handler associated with this request + * + * The caller's @prep_sqe() function is invoked to fill in the details of + * the sqe. Do not call io_uring_sqe_set_data() on this sqe. + * + * The kernel may see the sqe as soon as @prep_sqe() returns or it may take + * until the next event loop iteration. + * + * This function is called from the current AioContext and is not + * thread-safe. + */ + void (*add_sqe)(AioContext *ctx, + void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque), + void *opaque, CqeHandler *cqe_handler); +#endif /* CONFIG_LINUX_IO_URING */ } FDMonOps; /* @@ -217,12 +310,14 @@ struct AioContext { struct LinuxAioState *linux_aio; #endif #ifdef CONFIG_LINUX_IO_URING - LuringState *linux_io_uring; - /* State for file descriptor monitoring using Linux io_uring */ struct io_uring fdmon_io_uring; AioHandlerSList submit_list; -#endif + void *io_uring_fd_tag; + + /* Pending callback state for cqe handlers */ + CqeHandlerSimpleQ cqe_handler_ready_list; +#endif /* CONFIG_LINUX_IO_URING */ /* TimerLists for calling timers - one per clock type. Has its own * locking. @@ -254,7 +349,13 @@ struct AioContext { /* epoll(7) state used when built with CONFIG_EPOLL */ int epollfd; + /* The GSource unix fd tag for epollfd */ + void *epollfd_tag; + const FDMonOps *fdmon_ops; + + /* Was aio_context_new() successful? */ + bool initialized; }; /** @@ -512,11 +613,6 @@ struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp); /* Return the LinuxAioState bound to this AioContext */ struct LinuxAioState *aio_get_linux_aio(AioContext *ctx); -/* Setup the LuringState bound to this AioContext */ -LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp); - -/* Return the LuringState bound to this AioContext */ -LuringState *aio_get_linux_io_uring(AioContext *ctx); /** * aio_timer_new_with_attrs: * @ctx: the aio context @@ -679,10 +775,13 @@ void qemu_set_current_aio_context(AioContext *ctx); /** * aio_context_setup: * @ctx: the aio context + * @errp: error pointer * * Initialize the aio context. + * + * Returns: true on success, false otherwise */ -void aio_context_setup(AioContext *ctx); +bool aio_context_setup(AioContext *ctx, Error **errp); /** * aio_context_destroy: @@ -692,9 +791,6 @@ void aio_context_setup(AioContext *ctx); */ void aio_context_destroy(AioContext *ctx); -/* Used internally, do not call outside AioContext code */ -void aio_context_use_g_source(AioContext *ctx); - /** * aio_context_set_poll_params: * @ctx: the aio context @@ -724,4 +820,40 @@ void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch); */ void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min, int64_t max, Error **errp); + +#ifdef CONFIG_LINUX_IO_URING +/** + * aio_has_io_uring: Return whether io_uring is available. + * + * io_uring is either available in all AioContexts or in none, so this only + * needs to be called once from within any thread's AioContext. + */ +static inline bool aio_has_io_uring(void) +{ + AioContext *ctx = qemu_get_current_aio_context(); + return ctx->fdmon_ops->add_sqe; +} + +/** + * aio_add_sqe: Add an io_uring sqe for submission. 
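+ * This is the public entry point for the FDMonOps->add_sqe() callback of
+ * the current AioContext.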
+ * @prep_sqe: invoked with an sqe that should be prepared for submission + * @opaque: user-defined argument to @prep_sqe() + * @cqe_handler: the unique cqe handler associated with this request + * + * The caller's @prep_sqe() function is invoked to fill in the details of the + * sqe. Do not call io_uring_sqe_set_data() on this sqe. + * + * The sqe is submitted by the current AioContext. The kernel may see the sqe + * as soon as @prep_sqe() returns or it may take until the next event loop + * iteration. + * + * When the AioContext is destroyed, pending sqes are ignored and their + * CqeHandlers are not invoked. + * + * This function must be called only when aio_has_io_uring() returns true. + */ +void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque), + void *opaque, CqeHandler *cqe_handler); +#endif /* CONFIG_LINUX_IO_URING */ + #endif diff --git a/include/block/block-global-state.h b/include/block/block-global-state.h index 62da83c616..479ca2858e 100644 --- a/include/block/block-global-state.h +++ b/include/block/block-global-state.h @@ -65,7 +65,8 @@ int co_wrapper bdrv_create(BlockDriver *drv, const char *filename, QemuOpts *opts, Error **errp); int coroutine_fn GRAPH_UNLOCKED -bdrv_co_create_file(const char *filename, QemuOpts *opts, Error **errp); +bdrv_co_create_file(const char *filename, QemuOpts *opts, + bool allow_protocol_prefix, Error **errp); BlockDriverState *bdrv_new(void); int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, diff --git a/include/block/nbd.h b/include/block/nbd.h index 92987c76fd..ab40842da9 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -296,7 +296,7 @@ enum { NBD_CMD_BLOCK_STATUS = 7, }; -#define NBD_DEFAULT_PORT 10809 +#define NBD_DEFAULT_PORT 10809 /* Maximum size of a single READ/WRITE data buffer */ #define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024) diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index 6570244496..30e5fc9a9f 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -74,15 +74,10 @@ static inline bool laio_has_fua(void) #endif /* io_uring.c - Linux io_uring implementation */ #ifdef CONFIG_LINUX_IO_URING -LuringState *luring_init(Error **errp); -void luring_cleanup(LuringState *s); - /* luring_co_submit: submit I/O requests in the thread's current AioContext. 
*/ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, QEMUIOVector *qiov, int type, BdrvRequestFlags flags); -void luring_detach_aio_context(LuringState *s, AioContext *old_context); -void luring_attach_aio_context(LuringState *s, AioContext *new_context); bool luring_has_fua(void); #else static inline bool luring_has_fua(void) diff --git a/meson.build b/meson.build index 48c1795b0f..df4460035c 100644 --- a/meson.build +++ b/meson.build @@ -2745,6 +2745,8 @@ endif if linux_io_uring.found() config_host_data.set('HAVE_IO_URING_PREP_WRITEV2', cc.has_header_symbol('liburing.h', 'io_uring_prep_writev2')) + config_host_data.set('HAVE_IO_URING_CQ_HAS_OVERFLOW', + cc.has_header_symbol('liburing.h', 'io_uring_cq_has_overflow')) endif config_host_data.set('HAVE_TCP_KEEPCNT', cc.has_header_symbol('netinet/tcp.h', 'TCP_KEEPCNT') or diff --git a/qemu-img.c b/qemu-img.c index a7791896c1..c42dd4e995 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -4081,7 +4081,7 @@ static int img_rebase(const img_cmd_t *ccmd, int argc, char **argv) n += offset - QEMU_ALIGN_DOWN(offset, write_align); offset = QEMU_ALIGN_DOWN(offset, write_align); n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n); - n = MIN(n, size - offset); + n = MIN(n, MIN(size - offset, IO_BUF_SIZE)); assert(!bdrv_is_allocated(unfiltered_bs, offset, n, &n_alloc) && n_alloc == n); @@ -4597,9 +4597,9 @@ static int img_amend(const img_cmd_t *ccmd, int argc, char **argv) amend_opts = qemu_opts_append(amend_opts, bs->drv->amend_opts); opts = qemu_opts_create(amend_opts, NULL, 0, &error_abort); if (!qemu_opts_do_parse(opts, options, NULL, &err)) { + qemu_opts_del(opts); /* Try to parse options using the create options */ amend_opts = qemu_opts_append(amend_opts, bs->drv->create_opts); - qemu_opts_del(opts); opts = qemu_opts_create(amend_opts, NULL, 0, &error_abort); if (qemu_opts_do_parse(opts, options, NULL, NULL)) { error_append_hint(&err, diff --git a/stubs/io_uring.c b/stubs/io_uring.c deleted file mode 100644 index 622d1e4648..0000000000 --- a/stubs/io_uring.c +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Linux io_uring support. - * - * Copyright (C) 2009 IBM, Corp. - * Copyright (C) 2009 Red Hat, Inc. - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - */ -#include "qemu/osdep.h" -#include "block/aio.h" -#include "block/raw-aio.h" - -void luring_detach_aio_context(LuringState *s, AioContext *old_context) -{ - abort(); -} - -void luring_attach_aio_context(LuringState *s, AioContext *new_context) -{ - abort(); -} - -LuringState *luring_init(Error **errp) -{ - abort(); -} - -void luring_cleanup(LuringState *s) -{ - abort(); -} diff --git a/stubs/meson.build b/stubs/meson.build index 27be2dec9f..0b2778c568 100644 --- a/stubs/meson.build +++ b/stubs/meson.build @@ -32,9 +32,6 @@ if have_block or have_ga stub_ss.add(files('cpus-virtual-clock.c')) stub_ss.add(files('icount.c')) stub_ss.add(files('graph-lock.c')) - if linux_io_uring.found() - stub_ss.add(files('io_uring.c')) - endif if libaio.found() stub_ss.add(files('linux-aio.c')) endif diff --git a/tests/qemu-iotests/024 b/tests/qemu-iotests/024 index b29c76e161..021169b4a1 100755 --- a/tests/qemu-iotests/024 +++ b/tests/qemu-iotests/024 @@ -315,6 +315,52 @@ echo $QEMU_IMG map "$OVERLAY" | _filter_qemu_img_map +# Check that the region to copy to the overlay during a rebase +# operation does not exceed the I/O buffer size. 
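+# (IO_BUF_SIZE is qemu-img's 2 MB copy buffer; the img_rebase change in this
+# patch clamps each copied region to it.)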
+# +# backing_new <-- backing_old <-- overlay +# +# Backing (new): -- -- -- -- <-- Empty image, size 4MB +# Backing (old):|--|ff|ff|--| <-- 4 clusters, 1MB each +# Overlay: |-- --|-- --| <-- 2 clusters, 2MB each +# +# The data at [1MB, 3MB) must be copied from the old backing image to +# the overlay. However the rebase code will extend that region to the +# overlay's (sub)cluster boundaries to avoid CoW (see commit 12df580b). +# This test checks that IO_BUF_SIZE (2 MB) is taken into account. + +echo +echo "=== Test that the region to copy does not exceed 2MB (IO_BUF_SIZE) ===" +echo + +echo "Creating backing chain" +echo + +TEST_IMG=$BASE_NEW _make_test_img 4M +TEST_IMG=$BASE_OLD CLUSTER_SIZE=1M _make_test_img -b "$BASE_NEW" -F $IMGFMT +TEST_IMG=$OVERLAY CLUSTER_SIZE=2M _make_test_img -b "$BASE_OLD" -F $IMGFMT + +echo +echo "Writing data to region [1MB, 3MB)" +echo + +$QEMU_IO "$BASE_OLD" -c "write -P 0xff 1M 2M" | _filter_qemu_io + +echo +echo "Rebasing" +echo + +$QEMU_IMG rebase -b "$BASE_NEW" -F $IMGFMT "$OVERLAY" + +echo "Verifying the data" +echo + +$QEMU_IO "$OVERLAY" -c "read -P 0x00 0 1M" | _filter_qemu_io +$QEMU_IO "$OVERLAY" -c "read -P 0xff 1M 2M" | _filter_qemu_io +$QEMU_IO "$OVERLAY" -c "read -P 0x00 3M 1M" | _filter_qemu_io + +$QEMU_IMG map "$OVERLAY" | _filter_qemu_img_map + echo # success, all done diff --git a/tests/qemu-iotests/024.out b/tests/qemu-iotests/024.out index 3d1e31927a..1b7522ba71 100644 --- a/tests/qemu-iotests/024.out +++ b/tests/qemu-iotests/024.out @@ -243,4 +243,30 @@ Offset Length File 0 0x20000 TEST_DIR/subdir/t.IMGFMT 0x40000 0x20000 TEST_DIR/subdir/t.IMGFMT +=== Test that the region to copy does not exceed 2MB (IO_BUF_SIZE) === + +Creating backing chain + +Formatting 'TEST_DIR/subdir/t.IMGFMT.base_new', fmt=IMGFMT size=4194304 +Formatting 'TEST_DIR/subdir/t.IMGFMT.base_old', fmt=IMGFMT size=4194304 backing_file=TEST_DIR/subdir/t.IMGFMT.base_new backing_fmt=IMGFMT +Formatting 'TEST_DIR/subdir/t.IMGFMT', fmt=IMGFMT size=4194304 backing_file=TEST_DIR/subdir/t.IMGFMT.base_old backing_fmt=IMGFMT + +Writing data to region [1MB, 3MB) + +wrote 2097152/2097152 bytes at offset 1048576 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +Rebasing + +Verifying the data + +read 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 2097152/2097152 bytes at offset 1048576 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1048576/1048576 bytes at offset 3145728 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x400000 TEST_DIR/subdir/t.IMGFMT + *** done diff --git a/tests/qemu-iotests/184 b/tests/qemu-iotests/184 index 6d0afe9d38..9248b3265d 100755 --- a/tests/qemu-iotests/184 +++ b/tests/qemu-iotests/184 @@ -51,7 +51,7 @@ run_qemu() } test_throttle=$($QEMU_IMG --help|grep throttle) -[ "$test_throttle" = "" ] && _supported_fmt throttle +[ "$test_throttle" = "" ] && _notrun "qemu-img does not support throttle" echo echo "== checking interface ==" diff --git a/tests/qemu-iotests/257 b/tests/qemu-iotests/257 index 7d3720b8e5..cd0468aaa1 100755 --- a/tests/qemu-iotests/257 +++ b/tests/qemu-iotests/257 @@ -310,14 +310,18 @@ def test_bitmap_sync(bsync_mode, msync_mode='bitmap', failure=None): 'state': 1, 'new_state': 2 }, { - 'event': 'read_aio', + 'event': 'flush_to_disk', 'state': 2, 'new_state': 3 + }, { + 'event': "read_aio", + 'state': 3, + 'new_state': 4 }], 'inject-error': [{ 'event': 'read_aio', 'errno': 5, - 'state': 3, + 'state': 4, 'immediately': False, 'once': True }] diff 
--git a/tests/qemu-iotests/257.out b/tests/qemu-iotests/257.out index c33dd7f3a9..fb28333cb2 100644 --- a/tests/qemu-iotests/257.out +++ b/tests/qemu-iotests/257.out @@ -272,7 +272,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK! --- Preparing image & VM --- -{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}} +{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}} {"return": {}} --- Write #0 --- @@ -1017,7 +1017,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK! --- Preparing image & VM --- -{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}} +{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}} {"return": {}} --- Write #0 --- @@ -1762,7 +1762,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK! --- Preparing image & VM --- -{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}} +{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}} {"return": {}} --- Write #0 --- @@ -2507,7 +2507,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK! 
--- Preparing image & VM --- -{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}} +{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}} {"return": {}} --- Write #0 --- @@ -3252,7 +3252,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK! --- Preparing image & VM --- -{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}} +{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}} {"return": {}} --- Write #0 --- @@ -3997,7 +3997,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK! --- Preparing image & VM --- -{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}} +{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}} {"return": {}} --- Write #0 --- @@ -4742,7 +4742,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK! 
 
 --- Preparing image & VM ---
 
-{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
+{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
 {"return": {}}
 
 --- Write #0 ---
diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
index d9b7c1d598..3941eac8e2 100755
--- a/tests/qemu-iotests/check
+++ b/tests/qemu-iotests/check
@@ -17,6 +17,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import os
+import re
 import sys
 import argparse
 import shutil
@@ -82,7 +83,7 @@ def make_argparser() -> argparse.ArgumentParser:
     g_env.add_argument('-i', dest='aiomode', default='threads',
                        help='sets AIOMODE environment variable')
 
-    p.set_defaults(imgfmt='raw', imgproto='file')
+    p.set_defaults(imgproto='file')
 
     format_list = ['raw', 'bochs', 'cloop', 'parallels', 'qcow', 'qcow2',
                    'qed', 'vdi', 'vpc', 'vhdx', 'vmdk', 'luks', 'dmg', 'vvfat']
@@ -137,15 +138,50 @@ def make_argparser() -> argparse.ArgumentParser:
     return p
 
 
+def dry_run_list(test_dir, imgfmt, testlist):
+    for t in testlist:
+        if not imgfmt:
+            print('\n'.join([os.path.basename(t)]))
+            continue
+        # If a format has been given, we look for the "supported_fmt"
+        # and the "unsupported_fmt" lines in the test and try to find out
+        # whether the format is supported or not. This is only heuristics
+        # (it can e.g. fail if the "unsupported_fmts" and "supported_fmts"
+        # statements are in the same line), but it should be good enough
+        # to get a proper list for "make check-block"
+        with open(os.path.join(test_dir, t), 'r', encoding='utf-8') as fh:
+            supported = True
+            check_next_line = False
+            sd = "[ \t'\"]"       # Start delimiter
+            ed = "([ \t'\"]|$)"   # End delimiter
+            for line in fh:
+                if 'unsupported_fmt' in line:
+                    if re.search(sd + imgfmt + ed, line):
+                        supported = False
+                        break
+                elif 'supported_fmt' in line or check_next_line:
+                    if re.search(sd + 'generic' + ed, line):
+                        continue    # Might be followed by "unsupported" line
+                    supported = re.search(sd + imgfmt + ed, line)
+                    check_next_line = not ']' in line and \
+                        ('supported_fmts=[' in line or check_next_line)
+                    if supported or not check_next_line:
+                        break
+        if supported:
+            print('\n'.join([os.path.basename(t)]))
+
+
 if __name__ == '__main__':
     warnings.simplefilter("default")
     os.environ["PYTHONWARNINGS"] = "default"
 
     args = make_argparser().parse_args()
+
+    image_format = args.imgfmt or 'raw'
+
     env = TestEnv(source_dir=args.source_dir, build_dir=args.build_dir,
-                  imgfmt=args.imgfmt, imgproto=args.imgproto,
+                  imgfmt=image_format, imgproto=args.imgproto,
                   aiomode=args.aiomode, cachemode=args.cachemode,
                   imgopts=args.imgopts, misalign=args.misalign,
                   debug=args.debug, valgrind=args.valgrind,
@@ -189,7 +225,7 @@ if __name__ == '__main__':
 
     if args.dry_run:
         with env:
-            print('\n'.join([os.path.basename(t) for t in tests]))
+            dry_run_list(env.source_iotests, args.imgfmt, tests)
     else:
         with TestRunner(env, tap=args.tap, color=args.color) as tr:
diff --git a/tests/qemu-iotests/meson.build b/tests/qemu-iotests/meson.build
index fad340ad59..d7bae71ced 100644
--- a/tests/qemu-iotests/meson.build
+++ b/tests/qemu-iotests/meson.build
@@ -2,14 +2,6 @@ if not have_tools or host_os == 'windows'
   subdir_done()
 endif
 
-foreach cflag: qemu_ldflags
-  if cflag.startswith('-fsanitize') and \
-     not cflag.contains('safe-stack') and not cflag.contains('cfi-icall')
-    message('Sanitizers are enabled ==> Disabled the qemu-iotests.')
-    subdir_done()
-  endif
-endforeach
-
 bash = find_program('bash', required: false, version: '>= 4.0')
 if not bash.found()
   message('bash >= v4.0 not available ==> Disabled the qemu-iotests.')
@@ -21,7 +13,10 @@ qemu_iotests_env = {'PYTHON': python.full_path()}
 qemu_iotests_formats = {
   'qcow2': 'quick',
   'raw': 'slow',
+  'parallels': 'thorough',
   'qed': 'thorough',
+  'vdi': 'thorough',
+  'vhdx': 'thorough',
   'vmdk': 'thorough',
   'vpc': 'thorough'
 }
diff --git a/tests/qemu-iotests/testrunner.py b/tests/qemu-iotests/testrunner.py
index 14cc8492f9..e2a3658994 100644
--- a/tests/qemu-iotests/testrunner.py
+++ b/tests/qemu-iotests/testrunner.py
@@ -263,10 +263,21 @@ class TestRunner(contextlib.AbstractContextManager['TestRunner']):
             Path(env[d]).mkdir(parents=True, exist_ok=True)
 
         test_dir = env['TEST_DIR']
+        f_asan = Path(test_dir, f_test.name + '.out.asan')
         f_bad = Path(test_dir, f_test.name + '.out.bad')
         f_notrun = Path(test_dir, f_test.name + '.notrun')
         f_casenotrun = Path(test_dir, f_test.name + '.casenotrun')
 
+        env['ASAN_OPTIONS'] = f'detect_leaks=0:log_path={f_asan}'
+
+        def unlink_asan():
+            with os.scandir(test_dir) as it:
+                for entry in it:
+                    if entry.name.startswith(f_asan.name):
+                        os.unlink(entry)
+
+        unlink_asan()
+
         for p in (f_notrun, f_casenotrun):
             silent_unlink(p)
@@ -312,6 +323,7 @@ class TestRunner(contextlib.AbstractContextManager['TestRunner']):
                               description=f'output mismatch (see {f_bad})',
                               diff=diff, casenotrun=casenotrun)
         else:
+            unlink_asan()
             f_bad.unlink()
             return TestResult(status='pass', elapsed=elapsed,
                               casenotrun=casenotrun)
diff --git a/tests/qemu-iotests/tests/resize-below-raw b/tests/qemu-iotests/tests/resize-below-raw
index 3c9241c918..ddf3f44742 100755
--- a/tests/qemu-iotests/tests/resize-below-raw
+++ b/tests/qemu-iotests/tests/resize-below-raw
@@ -8,16 +8,27 @@
 # SPDX-License-Identifier: GPL-2.0-or-later
 
 import os
+from typing import Dict, Optional
+
 import iotests
 from iotests import imgfmt, qemu_img_create, QMPTestCase
 
 image_size = 1 * 1024 * 1024
 image = os.path.join(iotests.test_dir, 'test.img')
 
-class TestResizeBelowRaw(QMPTestCase):
+class BaseResizeBelowRaw(QMPTestCase):
+    raw_size: Optional[int] = None
+    raw_offset: Optional[int] = None
+
     def setUp(self) -> None:
         qemu_img_create('-f', imgfmt, image, str(image_size))
 
+        extra_options: Dict[str, str] = {}
+        if self.raw_size is not None:
+            extra_options['size'] = str(self.raw_size)
+        if self.raw_offset is not None:
+            extra_options['offset'] = str(self.raw_offset)
+
         self.vm = iotests.VM()
         self.vm.add_blockdev(self.vm.qmp_to_opts({
             'driver': imgfmt,
@@ -26,7 +37,8 @@ class TestResizeBelowRaw(QMPTestCase):
                 'driver': 'file',
                 'filename': image,
                 'node-name': 'file0',
-            }
+            },
+            **extra_options
         }))
         self.vm.launch()
@@ -34,14 +46,16 @@ class TestResizeBelowRaw(QMPTestCase):
         self.vm.shutdown()
         os.remove(image)
 
-    def assert_size(self, size: int) -> None:
+    def assert_size(self, size: int, file_size: Optional[int] = None) -> None:
         nodes = self.vm.qmp('query-named-block-nodes', flat=True)['return']
         self.assertEqual(len(nodes), 2)
         for node in nodes:
-            if node['drv'] == 'file':
+            if node['drv'] == 'file' and file_size is not None:
+                self.assertEqual(node['image']['virtual-size'], file_size)
                 continue
             self.assertEqual(node['image']['virtual-size'], size)
 
+class TestResizeBelowUnlimitedRaw(BaseResizeBelowRaw):
     def test_resize_below_raw(self) -> None:
         self.assert_size(image_size)
         self.vm.qmp('block_resize', node_name='file0', size=2*image_size)
@@ -49,5 +63,36 @@
         self.vm.qmp('block_resize', node_name='node0', size=3*image_size)
         self.assert_size(3*image_size)
 
+# offset = 0 behaves the same as absent offset
+class TestResizeBelowRawWithZeroOffset(TestResizeBelowUnlimitedRaw):
+    raw_offset = 0
+
+class TestResizeBelowRawWithSize(BaseResizeBelowRaw):
+    raw_size = image_size // 2
+
+    def test_resize_below_raw_with_size(self) -> None:
+        self.assert_size(image_size // 2, image_size)
+
+        # This QMP command fails because node0 unshares RESIZE
+        self.vm.qmp('block_resize', node_name='file0', size=2*image_size)
+        self.assert_size(image_size // 2, image_size)
+
+        # This QMP command fails because node0 is a fixed-size disk
+        self.vm.qmp('block_resize', node_name='node0', size=3*image_size)
+        self.assert_size(image_size // 2, image_size)
+
+class TestResizeBelowRawWithOffset(BaseResizeBelowRaw):
+    raw_offset = image_size // 4
+
+    def test_resize_below_raw_with_offset(self) -> None:
+        self.assert_size(image_size * 3 // 4, image_size)
+
+        # This QMP command fails because node0 unshares RESIZE
+        self.vm.qmp('block_resize', node_name='file0', size=2*image_size)
+        self.assert_size(image_size * 3 // 4, image_size)
+
+        self.vm.qmp('block_resize', node_name='node0', size=3*image_size)
+        self.assert_size(3 * image_size, image_size * 13 // 4)
+
 if __name__ == '__main__':
     iotests.main(supported_fmts=['raw'], supported_protocols=['file'])
diff --git a/tests/qemu-iotests/tests/resize-below-raw.out b/tests/qemu-iotests/tests/resize-below-raw.out
index ae1213e6f8..89968f35d7 100644
--- a/tests/qemu-iotests/tests/resize-below-raw.out
+++ b/tests/qemu-iotests/tests/resize-below-raw.out
@@ -1,5 +1,5 @@
-.
+....
 ----------------------------------------------------------------------
-Ran 1 tests
+Ran 4 tests
 
 OK
diff --git a/tests/unit/test-aio.c b/tests/unit/test-aio.c
index e77d86be87..010d65b79a 100644
--- a/tests/unit/test-aio.c
+++ b/tests/unit/test-aio.c
@@ -527,7 +527,12 @@ static void test_source_bh_delete_from_cb(void)
     g_assert_cmpint(data1.n, ==, data1.max);
     g_assert(data1.bh == NULL);
 
-    assert(g_main_context_iteration(NULL, false));
+    /*
+     * There may be up to one more iteration due to the aio_notify
+     * EventNotifier.
+     */
+    g_main_context_iteration(NULL, false);
+    assert(!g_main_context_iteration(NULL, false));
 }
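
The test-aio.c hunk above deliberately tolerates a single spurious wakeup caused by the aio_notify EventNotifier. A test that merely needs the default GMainContext fully drained, without asserting how many iterations that takes, could use a loop along these lines (a hypothetical helper, not part of this patch; g_main_context_iteration() returns TRUE whenever a source was dispatched):

    #include <glib.h>

    /*
     * Hypothetical test helper: run non-blocking iterations until the
     * default GMainContext has no more work, tolerating any number of
     * spurious wakeups such as the aio_notify EventNotifier.
     */
    static void drain_default_main_context(void)
    {
        while (g_main_context_iteration(NULL, FALSE)) {
            /* keep iterating as long as a source was dispatched */
        }
    }

The fixed two-iteration form used in the test is stricter on purpose: it also asserts that no more than one extra wakeup occurs.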
diff --git a/tests/unit/test-nested-aio-poll.c b/tests/unit/test-nested-aio-poll.c
index d8fd92c43b..d13ecccd8c 100644
--- a/tests/unit/test-nested-aio-poll.c
+++ b/tests/unit/test-nested-aio-poll.c
@@ -15,6 +15,7 @@
 #include "qemu/osdep.h"
 #include "block/aio.h"
 #include "qapi/error.h"
+#include "util/aio-posix.h"
 
 typedef struct {
     AioContext *ctx;
@@ -71,17 +72,17 @@ static void test(void)
         .ctx = aio_context_new(&error_abort),
     };
 
+    if (td.ctx->fdmon_ops != &fdmon_poll_ops) {
+        /* This test is tied to fdmon-poll.c */
+        g_test_skip("fdmon_poll_ops not in use");
+        return;
+    }
+
     qemu_set_current_aio_context(td.ctx);
 
     /* Enable polling */
     aio_context_set_poll_params(td.ctx, 1000000, 2, 2, &error_abort);
 
-    /*
-     * The GSource is unused but this has the side-effect of changing the fdmon
-     * that AioContext uses.
-     */
-    aio_get_g_source(td.ctx);
-
     /* Make the event notifier active (set) right away */
     event_notifier_init(&td.poll_notifier, 1);
     aio_set_event_notifier(td.ctx, &td.poll_notifier,
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 2e0a5dadc4..e24b955fd9 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -16,6 +16,7 @@
 #include "qemu/osdep.h"
 #include "block/block.h"
 #include "block/thread-pool.h"
+#include "qapi/error.h"
 #include "qemu/main-loop.h"
 #include "qemu/lockcnt.h"
 #include "qemu/rcu.h"
@@ -70,15 +71,6 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 
 static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
 {
-    /* If the GSource is in the process of being destroyed then
-     * g_source_remove_poll() causes an assertion failure.  Skip
-     * removal in that case, because glib cleans up its state during
-     * destruction anyway.
-     */
-    if (!g_source_is_destroyed(&ctx->source)) {
-        g_source_remove_poll(&ctx->source, &node->pfd);
-    }
-
     node->pfd.revents = 0;
     node->poll_ready = false;
 
@@ -153,7 +145,6 @@ void aio_set_fd_handler(AioContext *ctx,
         } else {
             new_node->pfd = node->pfd;
         }
-        g_source_add_poll(&ctx->source, &new_node->pfd);
 
         new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
         new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
@@ -267,37 +258,13 @@ bool aio_prepare(AioContext *ctx)
     poll_set_started(ctx, &ready_list, false);
 
     /* TODO what to do with this list? */
+    ctx->fdmon_ops->gsource_prepare(ctx);
     return false;
 }
 
 bool aio_pending(AioContext *ctx)
 {
-    AioHandler *node;
-    bool result = false;
-
-    /*
-     * We have to walk very carefully in case aio_set_fd_handler is
-     * called while we're walking.
-     */
-    qemu_lockcnt_inc(&ctx->list_lock);
-
-    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
-        int revents;
-
-        /* TODO should this check poll ready? */
-        revents = node->pfd.revents & node->pfd.events;
-        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
-            result = true;
-            break;
-        }
-        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
-            result = true;
-            break;
-        }
-    }
-    qemu_lockcnt_dec(&ctx->list_lock);
-
-    return result;
+    return ctx->fdmon_ops->gsource_check(ctx);
 }
 
 static void aio_free_deleted_handlers(AioContext *ctx)
@@ -390,10 +357,6 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
     return progress;
 }
 
-/*
- * If we have a list of ready handlers then this is more efficient than
- * scanning all handlers with aio_dispatch_handlers().
- */
 static bool aio_dispatch_ready_handlers(AioContext *ctx,
                                         AioHandlerList *ready_list,
                                         int64_t block_ns)
@@ -417,24 +380,23 @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
     return progress;
 }
 
-/* Slower than aio_dispatch_ready_handlers() but only used via glib */
-static bool aio_dispatch_handlers(AioContext *ctx)
-{
-    AioHandler *node, *tmp;
-    bool progress = false;
-
-    QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
-        progress = aio_dispatch_handler(ctx, node) || progress;
-    }
-
-    return progress;
-}
-
 void aio_dispatch(AioContext *ctx)
 {
+    AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
+
     qemu_lockcnt_inc(&ctx->list_lock);
+
     aio_bh_poll(ctx);
-    aio_dispatch_handlers(ctx);
+
+    ctx->fdmon_ops->gsource_dispatch(ctx, &ready_list);
+
+    if (ctx->fdmon_ops->dispatch) {
+        ctx->fdmon_ops->dispatch(ctx);
+    }
+
+    /* block_ns is 0 because polling is disabled in the glib event loop */
+    aio_dispatch_ready_handlers(ctx, &ready_list, 0);
+
     aio_free_deleted_handlers(ctx);
 
     qemu_lockcnt_dec(&ctx->list_lock);
@@ -559,7 +521,14 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
         elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
         max_ns = qemu_soonest_timeout(*timeout, max_ns);
         assert(!(max_ns && progress));
-    } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
+
+        if (ctx->fdmon_ops->need_wait(ctx)) {
+            if (fdmon_supports_polling(ctx)) {
+                *timeout = 0; /* stay in polling mode */
+            }
+            break;
+        }
+    } while (elapsed_time < max_ns);
 
     if (remove_idle_poll_handlers(ctx, ready_list,
                                   start_time + elapsed_time)) {
@@ -722,7 +691,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
      * up IO threads when some work becomes pending. It is essential to
      * avoid hangs or unnecessary latency.
      */
-    if (poll_set_started(ctx, &ready_list, false)) {
+    if (timeout && poll_set_started(ctx, &ready_list, false)) {
         timeout = 0;
         progress = true;
     }
@@ -743,6 +712,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
         block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
     }
 
+    if (ctx->fdmon_ops->dispatch) {
+        progress |= ctx->fdmon_ops->dispatch(ctx);
+    }
+
     progress |= aio_bh_poll(ctx);
     progress |= aio_dispatch_ready_handlers(ctx, &ready_list, block_ns);
 
@@ -755,35 +728,50 @@ bool aio_poll(AioContext *ctx, bool blocking)
     return progress;
 }
 
-void aio_context_setup(AioContext *ctx)
+bool aio_context_setup(AioContext *ctx, Error **errp)
 {
     ctx->fdmon_ops = &fdmon_poll_ops;
     ctx->epollfd = -1;
+    ctx->epollfd_tag = NULL;
 
-    /* Use the fastest fd monitoring implementation if available */
-    if (fdmon_io_uring_setup(ctx)) {
-        return;
+#ifdef CONFIG_LINUX_IO_URING
+    {
+        static bool need_io_uring;
+        Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */
+
+        /* io_uring takes precedence because it provides aio_add_sqe() support */
+        if (fdmon_io_uring_setup(ctx, &local_err)) {
+            /*
+             * If one AioContext gets io_uring, then all AioContexts need
+             * io_uring so that aio_add_sqe() support is available across all
+             * threads.
+             */
+            need_io_uring = true;
+            return true;
+        }
+        if (need_io_uring) {
+            error_propagate(errp, local_err);
+            return false;
+        }
+
+        /* Silently fall back on systems where io_uring is unavailable */
+        error_free(local_err);
     }
+#endif /* CONFIG_LINUX_IO_URING */
 
     fdmon_epoll_setup(ctx);
+    return true;
 }
 
 void aio_context_destroy(AioContext *ctx)
 {
+#ifdef CONFIG_LINUX_IO_URING
     fdmon_io_uring_destroy(ctx);
-    fdmon_epoll_disable(ctx);
-    aio_free_deleted_handlers(ctx);
-}
+#endif
+
+    qemu_lockcnt_lock(&ctx->list_lock);
+    fdmon_epoll_disable(ctx);
+    qemu_lockcnt_unlock(&ctx->list_lock);
 
-void aio_context_use_g_source(AioContext *ctx)
-{
-    /*
-     * Disable io_uring when the glib main loop is used because it doesn't
-     * support mixed glib/aio_poll() usage. It relies on aio_poll() being
-     * called regularly so that changes to the monitored file descriptors are
-     * submitted, otherwise a list of pending fd handlers builds up.
-     */
-    fdmon_io_uring_destroy(ctx);
     aio_free_deleted_handlers(ctx);
 }
 
@@ -818,3 +806,12 @@ void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch)
 
     aio_notify(ctx);
 }
+
+#ifdef CONFIG_LINUX_IO_URING
+void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+                 void *opaque, CqeHandler *cqe_handler)
+{
+    AioContext *ctx = qemu_get_current_aio_context();
+    ctx->fdmon_ops->add_sqe(ctx, prep_sqe, opaque, cqe_handler);
+}
+#endif /* CONFIG_LINUX_IO_URING */
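
The rewritten aio_context_setup() encodes an all-or-nothing policy: the first AioContext that gets io_uring commits every later context to it, because aio_add_sqe() must be usable from all threads. A distilled sketch of that policy, with the QEMU types replaced by stand-ins (struct ctx, try_io_uring() and use_epoll() are illustrative stubs, not the real functions):

    #include <stdbool.h>

    struct ctx;

    /*
     * Illustrative stubs standing in for fdmon_io_uring_setup() and
     * fdmon_epoll_setup(); the preferred backend never succeeds here.
     */
    static bool try_io_uring(struct ctx *c) { (void)c; return false; }
    static void use_epoll(struct ctx *c) { (void)c; }

    static bool backend_setup(struct ctx *c)
    {
        static bool need_io_uring; /* has any context committed yet? */

        if (try_io_uring(c)) {
            need_io_uring = true; /* from now on, no silent fallback */
            return true;
        }
        if (need_io_uring) {
            return false; /* mixed backends would break aio_add_sqe() */
        }
        use_epoll(c); /* silent fallback is fine before any commitment */
        return true;
    }

The static flag makes the policy process-global, which is exactly the point: feature availability must not differ between AioContexts.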
diff --git a/util/aio-posix.h b/util/aio-posix.h
index 82a0201ea4..babbfa8314 100644
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -18,6 +18,7 @@
 #define AIO_POSIX_H
 
 #include "block/aio.h"
+#include "qapi/error.h"
 
 struct AioHandler {
     GPollFD pfd;
@@ -35,6 +36,7 @@ struct AioHandler {
 #ifdef CONFIG_LINUX_IO_URING
     QSLIST_ENTRY(AioHandler) node_submitted;
     unsigned flags; /* see fdmon-io_uring.c */
+    CqeHandler internal_cqe_handler; /* used for POLL_ADD/POLL_REMOVE */
 #endif
     int64_t poll_idle_timeout; /* when to stop userspace polling */
     bool poll_ready; /* has polling detected an event? */
@@ -47,9 +49,14 @@ void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
 
 extern const FDMonOps fdmon_poll_ops;
 
+/* Switch back to poll(2). list_lock must be held. */
+void fdmon_poll_downgrade(AioContext *ctx);
+
 #ifdef CONFIG_EPOLL_CREATE1
 bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
 void fdmon_epoll_setup(AioContext *ctx);
+
+/* list_lock must be held */
 void fdmon_epoll_disable(AioContext *ctx);
 #else
 static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
@@ -67,17 +74,8 @@ static inline void fdmon_epoll_disable(AioContext *ctx)
 #endif /* !CONFIG_EPOLL_CREATE1 */
 
 #ifdef CONFIG_LINUX_IO_URING
-bool fdmon_io_uring_setup(AioContext *ctx);
+bool fdmon_io_uring_setup(AioContext *ctx, Error **errp);
 void fdmon_io_uring_destroy(AioContext *ctx);
-#else
-static inline bool fdmon_io_uring_setup(AioContext *ctx)
-{
-    return false;
-}
-
-static inline void fdmon_io_uring_destroy(AioContext *ctx)
-{
-}
 #endif /* !CONFIG_LINUX_IO_URING */
 
 #endif /* AIO_POSIX_H */
diff --git a/util/aio-win32.c b/util/aio-win32.c
index c6fbce64c2..6e6f699e4b 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -419,18 +419,15 @@ bool aio_poll(AioContext *ctx, bool blocking)
     return progress;
 }
 
-void aio_context_setup(AioContext *ctx)
+bool aio_context_setup(AioContext *ctx, Error **errp)
 {
+    return true;
 }
 
 void aio_context_destroy(AioContext *ctx)
 {
 }
 
-void aio_context_use_g_source(AioContext *ctx)
-{
-}
-
 void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                  int64_t grow, int64_t shrink, Error **errp)
 {
diff --git a/util/async.c b/util/async.c
index a736d2cd0d..48f8828636 100644
--- a/util/async.c
+++ b/util/async.c
@@ -366,12 +366,16 @@ aio_ctx_dispatch(GSource *source,
 }
 
 static void
-aio_ctx_finalize(GSource     *source)
+aio_ctx_finalize(GSource *source)
 {
     AioContext *ctx = (AioContext *) source;
     QEMUBH *bh;
     unsigned flags;
 
+    if (!ctx->initialized) {
+        return;
+    }
+
     thread_pool_free_aio(ctx->thread_pool);
 
 #ifdef CONFIG_LINUX_AIO
@@ -382,14 +386,6 @@ aio_ctx_finalize(GSource *source)
     }
 #endif
 
-#ifdef CONFIG_LINUX_IO_URING
-    if (ctx->linux_io_uring) {
-        luring_detach_aio_context(ctx->linux_io_uring, ctx);
-        luring_cleanup(ctx->linux_io_uring);
-        ctx->linux_io_uring = NULL;
-    }
-#endif
-
     assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
     qemu_bh_delete(ctx->co_schedule_bh);
 
@@ -418,10 +414,11 @@ aio_ctx_finalize(GSource *source)
     aio_set_event_notifier(ctx, &ctx->notifier, NULL, NULL, NULL);
     event_notifier_cleanup(&ctx->notifier);
     qemu_rec_mutex_destroy(&ctx->lock);
-    qemu_lockcnt_destroy(&ctx->list_lock);
     timerlistgroup_deinit(&ctx->tlg);
     unregister_aiocontext(ctx);
     aio_context_destroy(ctx);
+    /* aio_context_destroy() still needs the lock */
+    qemu_lockcnt_destroy(&ctx->list_lock);
 }
 
 static GSourceFuncs aio_source_funcs = {
@@ -433,7 +430,6 @@ static GSourceFuncs aio_source_funcs = {
 
 GSource *aio_get_g_source(AioContext *ctx)
 {
-    aio_context_use_g_source(ctx);
     g_source_ref(&ctx->source);
     return &ctx->source;
 }
@@ -465,29 +461,6 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx)
 }
 #endif
 
-#ifdef CONFIG_LINUX_IO_URING
-LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp)
-{
-    if (ctx->linux_io_uring) {
-        return ctx->linux_io_uring;
-    }
-
-    ctx->linux_io_uring = luring_init(errp);
-    if (!ctx->linux_io_uring) {
-        return NULL;
-    }
-
-    luring_attach_aio_context(ctx->linux_io_uring, ctx);
-    return ctx->linux_io_uring;
-}
-
-LuringState *aio_get_linux_io_uring(AioContext *ctx)
-{
-    assert(ctx->linux_io_uring);
-    return ctx->linux_io_uring;
-}
-#endif
-
 void aio_notify(AioContext *ctx)
 {
     /*
@@ -577,19 +550,42 @@ static void co_schedule_bh_cb(void *opaque)
 
 AioContext *aio_context_new(Error **errp)
 {
+    ERRP_GUARD();
     int ret;
     AioContext *ctx;
 
+    /*
+     * ctx is freed by g_source_unref() (e.g. aio_context_unref()). ctx's
+     * resources are freed as follows:
+     *
+     * 1. By aio_ctx_finalize() after aio_context_new() has returned and set
+     *    ->initialized = true.
+     *
+     * 2. By manual cleanup code in this function's error paths before goto
+     *    fail.
+     *
+     * Be careful to free resources in both cases!
+     */
     ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
     QSLIST_INIT(&ctx->bh_list);
     QSIMPLEQ_INIT(&ctx->bh_slice_list);
-    aio_context_setup(ctx);
 
     ret = event_notifier_init(&ctx->notifier, false);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "Failed to initialize event notifier");
         goto fail;
     }
+
+    /*
+     * Resources cannot easily be freed manually after aio_context_setup(). If
+     * you add any new resources to AioContext, it's probably best to acquire
+     * them before aio_context_setup().
+     */
+    if (!aio_context_setup(ctx, errp)) {
+        event_notifier_cleanup(&ctx->notifier);
+        goto fail;
+    }
+
     g_source_set_can_recurse(&ctx->source, true);
     qemu_lockcnt_init(&ctx->list_lock);
 
@@ -604,10 +600,6 @@ AioContext *aio_context_new(Error **errp)
     ctx->linux_aio = NULL;
 #endif
 
-#ifdef CONFIG_LINUX_IO_URING
-    ctx->linux_io_uring = NULL;
-#endif
-
     ctx->thread_pool = NULL;
     qemu_rec_mutex_init(&ctx->lock);
     timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
@@ -623,9 +615,11 @@ AioContext *aio_context_new(Error **errp)
 
     register_aiocontext(ctx);
 
+    ctx->initialized = true;
+
     return ctx;
 fail:
-    g_source_destroy(&ctx->source);
+    g_source_unref(&ctx->source);
     return NULL;
 }
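
The reworked aio_context_new()/aio_ctx_finalize() pair follows a two-phase cleanup contract: until ctx->initialized is set, the constructor's error path frees partially acquired resources by hand and then drops the reference with g_source_unref(); after that point, finalize owns all cleanup. A minimal sketch of the same contract for a custom GSource (names are illustrative, not from the patch; g_source_new() zero-fills the structure, so the flag starts out false):

    #include <glib.h>

    typedef struct {
        GSource source;        /* must come first, as in AioContext */
        gboolean initialized;  /* false until construction succeeds */
        gchar *name;           /* stands in for notifier, thread pool, ... */
    } MySource;

    static void my_source_finalize(GSource *source)
    {
        MySource *s = (MySource *)source;

        if (!s->initialized) {
            return; /* the error path already cleaned up manually */
        }
        g_free(s->name);
    }

    static GSourceFuncs my_source_funcs = {
        .finalize = my_source_finalize,
    };

    static gboolean later_init_step(void)
    {
        return TRUE; /* stub: pretend the fallible init step succeeded */
    }

    static MySource *my_source_new(void)
    {
        MySource *s =
            (MySource *)g_source_new(&my_source_funcs, sizeof(MySource));

        s->name = g_strdup("example");

        if (!later_init_step()) {
            g_free(s->name);            /* manual cleanup, mirroring the */
            g_source_unref(&s->source); /* error path in aio_context_new() */
            return NULL;
        }

        s->initialized = TRUE;
        return s;
    }

Using g_source_unref() rather than g_source_destroy() on the error path matters: dropping the last reference actually runs finalize and frees the allocation, and the early return keeps finalize from touching resources that were never acquired.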
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
index 9fb8800dde..61118e1ee6 100644
--- a/util/fdmon-epoll.c
+++ b/util/fdmon-epoll.c
@@ -19,8 +19,12 @@ void fdmon_epoll_disable(AioContext *ctx)
         ctx->epollfd = -1;
     }
 
-    /* Switch back */
-    ctx->fdmon_ops = &fdmon_poll_ops;
+    if (ctx->epollfd_tag) {
+        g_source_remove_unix_fd(&ctx->source, ctx->epollfd_tag);
+        ctx->epollfd_tag = NULL;
+    }
+
+    fdmon_poll_downgrade(ctx);
 }
 
 static inline int epoll_events_from_pfd(int pfd_events)
@@ -93,10 +97,29 @@ out:
     return ret;
 }
 
+static void fdmon_epoll_gsource_prepare(AioContext *ctx)
+{
+    /* Do nothing */
+}
+
+static bool fdmon_epoll_gsource_check(AioContext *ctx)
+{
+    return g_source_query_unix_fd(&ctx->source, ctx->epollfd_tag) & G_IO_IN;
+}
+
+static void fdmon_epoll_gsource_dispatch(AioContext *ctx,
+                                         AioHandlerList *ready_list)
+{
+    fdmon_epoll_wait(ctx, ready_list, 0);
+}
+
 static const FDMonOps fdmon_epoll_ops = {
     .update = fdmon_epoll_update,
     .wait = fdmon_epoll_wait,
     .need_wait = aio_poll_disabled,
+    .gsource_prepare = fdmon_epoll_gsource_prepare,
+    .gsource_check = fdmon_epoll_gsource_check,
+    .gsource_dispatch = fdmon_epoll_gsource_dispatch,
 };
 
 static bool fdmon_epoll_try_enable(AioContext *ctx)
@@ -118,6 +141,8 @@ static bool fdmon_epoll_try_enable(AioContext *ctx)
     }
 
     ctx->fdmon_ops = &fdmon_epoll_ops;
+    ctx->epollfd_tag = g_source_add_unix_fd(&ctx->source, ctx->epollfd,
+                                            G_IO_IN);
     return true;
 }
 
@@ -139,12 +164,11 @@ bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
     }
 
     ok = fdmon_epoll_try_enable(ctx);
-
-    qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
-
     if (!ok) {
         fdmon_epoll_disable(ctx);
     }
+
+    qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
     return ok;
 }
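
fdmon-epoll now plugs into the glib loop through the unix-fd API instead of per-handler GPollFDs: the epoll fd itself is registered once with g_source_add_unix_fd(), and the check phase reads its readiness back with g_source_query_unix_fd(). The pattern in isolation, as a sketch on Unix hosts (the FdSource type is hypothetical, not from the patch):

    #include <glib.h>

    typedef struct {
        GSource source;
        gpointer fd_tag; /* opaque tag returned by g_source_add_unix_fd() */
    } FdSource;

    static gboolean fd_source_check(GSource *source)
    {
        FdSource *s = (FdSource *)source;

        /* Same shape as fdmon_epoll_gsource_check() above */
        return (g_source_query_unix_fd(source, s->fd_tag) & G_IO_IN) != 0;
    }

    static GSourceFuncs fd_source_funcs = {
        .check = fd_source_check,
    };

    static GSource *fd_source_new(int fd)
    {
        FdSource *s = (FdSource *)g_source_new(&fd_source_funcs,
                                               sizeof(FdSource));

        s->fd_tag = g_source_add_unix_fd(&s->source, fd, G_IO_IN);
        return &s->source;
    }

One fd registered with the GSource replaces N g_source_add_poll() entries, which is also why fdmon_epoll_disable() must hand the per-handler GPollFDs back via fdmon_poll_downgrade().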
#include "aio-posix.h" +#include "trace.h" enum { FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */ /* AioHandler::flags */ - FDMON_IO_URING_PENDING = (1 << 0), - FDMON_IO_URING_ADD = (1 << 1), - FDMON_IO_URING_REMOVE = (1 << 2), + FDMON_IO_URING_PENDING = (1 << 0), + FDMON_IO_URING_ADD = (1 << 1), + FDMON_IO_URING_REMOVE = (1 << 2), + FDMON_IO_URING_DELETE_AIO_HANDLER = (1 << 3), }; static inline int poll_events_from_pfd(int pfd_events) @@ -74,8 +78,8 @@ static inline int pfd_events_from_poll(int poll_events) } /* - * Returns an sqe for submitting a request. Only be called within - * fdmon_io_uring_wait(). + * Returns an sqe for submitting a request. Only called from the AioContext + * thread. */ static struct io_uring_sqe *get_sqe(AioContext *ctx) { @@ -166,41 +170,50 @@ static void fdmon_io_uring_update(AioContext *ctx, } } +static void fdmon_io_uring_add_sqe(AioContext *ctx, + void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque), + void *opaque, CqeHandler *cqe_handler) +{ + struct io_uring_sqe *sqe = get_sqe(ctx); + + prep_sqe(sqe, opaque); + io_uring_sqe_set_data(sqe, cqe_handler); + + trace_fdmon_io_uring_add_sqe(ctx, opaque, sqe->opcode, sqe->fd, sqe->off, + cqe_handler); +} + +static void fdmon_special_cqe_handler(CqeHandler *cqe_handler) +{ + /* + * This is an empty function that is never called. It is used as a function + * pointer to distinguish it from ordinary cqe handlers. + */ +} + static void add_poll_add_sqe(AioContext *ctx, AioHandler *node) { struct io_uring_sqe *sqe = get_sqe(ctx); int events = poll_events_from_pfd(node->pfd.events); io_uring_prep_poll_add(sqe, node->pfd.fd, events); - io_uring_sqe_set_data(sqe, node); + node->internal_cqe_handler.cb = fdmon_special_cqe_handler; + io_uring_sqe_set_data(sqe, &node->internal_cqe_handler); } static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node) { struct io_uring_sqe *sqe = get_sqe(ctx); + CqeHandler *cqe_handler = &node->internal_cqe_handler; #ifdef LIBURING_HAVE_DATA64 - io_uring_prep_poll_remove(sqe, (uintptr_t)node); + io_uring_prep_poll_remove(sqe, (uintptr_t)cqe_handler); #else - io_uring_prep_poll_remove(sqe, node); + io_uring_prep_poll_remove(sqe, cqe_handler); #endif io_uring_sqe_set_data(sqe, NULL); } -/* Add a timeout that self-cancels when another cqe becomes ready */ -static void add_timeout_sqe(AioContext *ctx, int64_t ns) -{ - struct io_uring_sqe *sqe; - struct __kernel_timespec ts = { - .tv_sec = ns / NANOSECONDS_PER_SECOND, - .tv_nsec = ns % NANOSECONDS_PER_SECOND, - }; - - sqe = get_sqe(ctx); - io_uring_prep_timeout(sqe, &ts, 1, 0); - io_uring_sqe_set_data(sqe, NULL); -} - /* Add sqes from ctx->submit_list for submission */ static void fill_sq_ring(AioContext *ctx) { @@ -218,22 +231,26 @@ static void fill_sq_ring(AioContext *ctx) if (flags & FDMON_IO_URING_REMOVE) { add_poll_remove_sqe(ctx, node); } + if (flags & FDMON_IO_URING_DELETE_AIO_HANDLER) { + /* + * process_cqe() sets this flag after ADD and REMOVE have been + * cleared. They cannot be set again, so they must be clear. 
+             */
+            assert(!(flags & FDMON_IO_URING_ADD));
+            assert(!(flags & FDMON_IO_URING_REMOVE));
+
+            QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+        }
     }
 }
 
-/* Returns true if a handler became ready */
-static bool process_cqe(AioContext *ctx,
-                        AioHandlerList *ready_list,
-                        struct io_uring_cqe *cqe)
+static bool process_cqe_aio_handler(AioContext *ctx,
+                                    AioHandlerList *ready_list,
+                                    AioHandler *node,
+                                    struct io_uring_cqe *cqe)
 {
-    AioHandler *node = io_uring_cqe_get_data(cqe);
     unsigned flags;
 
-    /* poll_timeout and poll_remove have a zero user_data field */
-    if (!node) {
-        return false;
-    }
-
     /*
      * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
      * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
@@ -241,7 +258,12 @@
      */
     flags = qatomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
     if (flags & FDMON_IO_URING_REMOVE) {
-        QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+        if (flags & FDMON_IO_URING_PENDING) {
+            /* Still on ctx->submit_list, defer deletion until fill_sq_ring() */
+            qatomic_or(&node->flags, FDMON_IO_URING_DELETE_AIO_HANDLER);
+        } else {
+            QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+        }
         return false;
     }
 
@@ -252,6 +274,35 @@
     return true;
 }
 
+/* Returns true if a handler became ready */
+static bool process_cqe(AioContext *ctx,
+                        AioHandlerList *ready_list,
+                        struct io_uring_cqe *cqe)
+{
+    CqeHandler *cqe_handler = io_uring_cqe_get_data(cqe);
+
+    /* poll_timeout and poll_remove have a zero user_data field */
+    if (!cqe_handler) {
+        return false;
+    }
+
+    /*
+     * Special handling for AioHandler cqes. They need ready_list and have a
+     * return value.
+     */
+    if (cqe_handler->cb == fdmon_special_cqe_handler) {
+        AioHandler *node = container_of(cqe_handler, AioHandler,
+                                        internal_cqe_handler);
+        return process_cqe_aio_handler(ctx, ready_list, node, cqe);
+    }
+
+    cqe_handler->cqe = *cqe;
+
+    /* Handlers are invoked later by fdmon_io_uring_dispatch() */
+    QSIMPLEQ_INSERT_TAIL(&ctx->cqe_handler_ready_list, cqe_handler, next);
+    return false;
+}
+
 static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
 {
     struct io_uring *ring = &ctx->fdmon_io_uring;
@@ -260,6 +311,13 @@ static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
     unsigned num_ready = 0;
     unsigned head;
 
+#ifdef HAVE_IO_URING_CQ_HAS_OVERFLOW
+    /* If the CQ overflowed then fetch CQEs with a syscall */
+    if (io_uring_cq_has_overflow(ring)) {
+        io_uring_get_events(ring);
+    }
+#endif
+
     io_uring_for_each_cqe(ring, head, cqe) {
         if (process_cqe(ctx, ready_list, cqe)) {
             num_ready++;
@@ -272,23 +330,91 @@ static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
     return num_ready;
 }
 
+/* This is where SQEs are submitted in the glib event loop */
+static void fdmon_io_uring_gsource_prepare(AioContext *ctx)
+{
+    fill_sq_ring(ctx);
+    if (io_uring_sq_ready(&ctx->fdmon_io_uring)) {
+        while (io_uring_submit(&ctx->fdmon_io_uring) == -EINTR) {
+            /* Keep trying if syscall was interrupted */
+        }
+    }
+}
+
+static bool fdmon_io_uring_gsource_check(AioContext *ctx)
+{
+    gpointer tag = ctx->io_uring_fd_tag;
+    return g_source_query_unix_fd(&ctx->source, tag) & G_IO_IN;
+}
+
+/* Dispatch CQE handlers that are ready */
+static bool fdmon_io_uring_dispatch(AioContext *ctx)
+{
+    CqeHandlerSimpleQ *ready_list = &ctx->cqe_handler_ready_list;
+    bool progress = false;
+
+    /* Handlers may use defer_call() to coalesce frequent operations */
+    defer_call_begin();
+
+    while (!QSIMPLEQ_EMPTY(ready_list)) {
+        CqeHandler *cqe_handler = QSIMPLEQ_FIRST(ready_list);
+
+        QSIMPLEQ_REMOVE_HEAD(ready_list, next);
+
+        trace_fdmon_io_uring_cqe_handler(ctx, cqe_handler,
+                                         cqe_handler->cqe.res);
+        cqe_handler->cb(cqe_handler);
+        progress = true;
+    }
+
+    defer_call_end();
+
+    return progress;
+}
+
+/* This is where CQEs are processed in the glib event loop */
+static void fdmon_io_uring_gsource_dispatch(AioContext *ctx,
+                                            AioHandlerList *ready_list)
+{
+    process_cq_ring(ctx, ready_list);
+}
+
 static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
                                int64_t timeout)
 {
+    struct __kernel_timespec ts;
     unsigned wait_nr = 1; /* block until at least one cqe is ready */
     int ret;
 
     if (timeout == 0) {
         wait_nr = 0; /* non-blocking */
     } else if (timeout > 0) {
-        add_timeout_sqe(ctx, timeout);
+        /* Add a timeout that self-cancels when another cqe becomes ready */
+        struct io_uring_sqe *sqe;
+
+        ts = (struct __kernel_timespec){
+            .tv_sec = timeout / NANOSECONDS_PER_SECOND,
+            .tv_nsec = timeout % NANOSECONDS_PER_SECOND,
+        };
+
+        sqe = get_sqe(ctx);
+        io_uring_prep_timeout(sqe, &ts, 1, 0);
+        io_uring_sqe_set_data(sqe, NULL);
     }
 
     fill_sq_ring(ctx);
 
+    /*
+     * Loop to handle signals in both cases:
+     * 1. If no SQEs were submitted, then -EINTR is returned.
+     * 2. If SQEs were submitted then the number of SQEs submitted is returned
+     *    rather than -EINTR.
+     */
     do {
         ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
-    } while (ret == -EINTR);
+    } while (ret == -EINTR ||
+             (ret >= 0 && wait_nr > io_uring_cq_ready(&ctx->fdmon_io_uring)));
 
     assert(ret >= 0);
 
@@ -319,43 +445,66 @@ static const FDMonOps fdmon_io_uring_ops = {
     .update = fdmon_io_uring_update,
     .wait = fdmon_io_uring_wait,
     .need_wait = fdmon_io_uring_need_wait,
+    .dispatch = fdmon_io_uring_dispatch,
+    .gsource_prepare = fdmon_io_uring_gsource_prepare,
+    .gsource_check = fdmon_io_uring_gsource_check,
+    .gsource_dispatch = fdmon_io_uring_gsource_dispatch,
+    .add_sqe = fdmon_io_uring_add_sqe,
 };
 
-bool fdmon_io_uring_setup(AioContext *ctx)
+bool fdmon_io_uring_setup(AioContext *ctx, Error **errp)
 {
     int ret;
 
+    ctx->io_uring_fd_tag = NULL;
+
     ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
     if (ret != 0) {
+        error_setg_errno(errp, -ret, "Failed to initialize io_uring");
         return false;
     }
 
     QSLIST_INIT(&ctx->submit_list);
+    QSIMPLEQ_INIT(&ctx->cqe_handler_ready_list);
     ctx->fdmon_ops = &fdmon_io_uring_ops;
+    ctx->io_uring_fd_tag = g_source_add_unix_fd(&ctx->source,
+            ctx->fdmon_io_uring.ring_fd, G_IO_IN);
+
     return true;
 }
 
 void fdmon_io_uring_destroy(AioContext *ctx)
 {
-    if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
-        AioHandler *node;
+    AioHandler *node;
 
-        io_uring_queue_exit(&ctx->fdmon_io_uring);
+    if (ctx->fdmon_ops != &fdmon_io_uring_ops) {
+        return;
+    }
 
-        /* Move handlers due to be removed onto the deleted list */
-        while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
-            unsigned flags = qatomic_fetch_and(&node->flags,
-                                          ~(FDMON_IO_URING_PENDING |
-                                            FDMON_IO_URING_ADD |
-                                            FDMON_IO_URING_REMOVE));
+    io_uring_queue_exit(&ctx->fdmon_io_uring);
 
-            if (flags & FDMON_IO_URING_REMOVE) {
-                QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
-            }
+    /* Move handlers due to be removed onto the deleted list */
+    while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
+        unsigned flags = qatomic_fetch_and(&node->flags,
+                                           ~(FDMON_IO_URING_PENDING |
+                                             FDMON_IO_URING_ADD |
+                                             FDMON_IO_URING_REMOVE |
+                                             FDMON_IO_URING_DELETE_AIO_HANDLER));
 
-            QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
+        if ((flags & FDMON_IO_URING_REMOVE) ||
+            (flags & FDMON_IO_URING_DELETE_AIO_HANDLER)) {
+            QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers,
+                                  node, node_deleted);
         }
-        ctx->fdmon_ops = &fdmon_poll_ops;
+        QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
     }
+
+    g_source_remove_unix_fd(&ctx->source, ctx->io_uring_fd_tag);
+    ctx->io_uring_fd_tag = NULL;
+
+    assert(QSIMPLEQ_EMPTY(&ctx->cqe_handler_ready_list));
+
+    qemu_lockcnt_lock(&ctx->list_lock);
+    fdmon_poll_downgrade(ctx);
+    qemu_lockcnt_unlock(&ctx->list_lock);
 }
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
index 17df917cf9..0ae755cc13 100644
--- a/util/fdmon-poll.c
+++ b/util/fdmon-poll.c
@@ -72,6 +72,11 @@ static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
 
     /* epoll(7) is faster above a certain number of fds */
     if (fdmon_epoll_try_upgrade(ctx, npfd)) {
+        QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+            if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events) {
+                g_source_remove_poll(&ctx->source, &node->pfd);
+            }
+        }
         npfd = 0; /* we won't need pollfds[], reset npfd */
         return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
     }
@@ -97,11 +102,89 @@ static void fdmon_poll_update(AioContext *ctx,
                               AioHandler *old_node,
                               AioHandler *new_node)
 {
-    /* Do nothing, AioHandler already contains the state we'll need */
+    if (old_node) {
+        /*
+         * If the GSource is in the process of being destroyed then
+         * g_source_remove_poll() causes an assertion failure. Skip removal in
+         * that case, because glib cleans up its state during destruction
+         * anyway.
+         */
+        if (!g_source_is_destroyed(&ctx->source)) {
+            g_source_remove_poll(&ctx->source, &old_node->pfd);
+        }
+    }
+
+    if (new_node) {
+        g_source_add_poll(&ctx->source, &new_node->pfd);
+    }
+}
+
+static void fdmon_poll_gsource_prepare(AioContext *ctx)
+{
+    /* Do nothing */
+}
+
+static bool fdmon_poll_gsource_check(AioContext *ctx)
+{
+    AioHandler *node;
+    bool result = false;
+
+    /*
+     * We have to walk very carefully in case aio_set_fd_handler is
+     * called while we're walking.
+     */
+    qemu_lockcnt_inc(&ctx->list_lock);
+
+    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+        int revents = node->pfd.revents & node->pfd.events;
+
+        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
+            result = true;
+            break;
+        }
+        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
+            result = true;
+            break;
+        }
+    }
+
+    qemu_lockcnt_dec(&ctx->list_lock);
+
+    return result;
+}
+
+static void fdmon_poll_gsource_dispatch(AioContext *ctx,
+                                        AioHandlerList *ready_list)
+{
+    AioHandler *node;
+
+    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+        int revents = node->pfd.revents;
+
+        if (revents) {
+            aio_add_ready_handler(ready_list, node, revents);
+        }
+    }
+}
 
 const FDMonOps fdmon_poll_ops = {
     .update = fdmon_poll_update,
     .wait = fdmon_poll_wait,
     .need_wait = aio_poll_disabled,
+    .gsource_prepare = fdmon_poll_gsource_prepare,
+    .gsource_check = fdmon_poll_gsource_check,
+    .gsource_dispatch = fdmon_poll_gsource_dispatch,
 };
+
+void fdmon_poll_downgrade(AioContext *ctx)
+{
+    AioHandler *node;
+
+    ctx->fdmon_ops = &fdmon_poll_ops;
+
+    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+        if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events) {
+            g_source_add_poll(&ctx->source, &node->pfd);
+        }
+    }
+}
diff --git a/util/trace-events b/util/trace-events
index bd8f25fb59..540d662507 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -24,6 +24,10 @@ buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes
 buffer_move(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
 buffer_free(const char *buf, size_t len) "%s: capacity %zd"
 
+# fdmon-io_uring.c
+fdmon_io_uring_add_sqe(void *ctx, void *opaque, int opcode, int fd, uint64_t off, void *cqe_handler) "ctx %p opaque %p opcode %d fd %d off %"PRId64" cqe_handler %p"
+fdmon_io_uring_cqe_handler(void *ctx, void *cqe_handler, int cqe_res) "ctx %p cqe_handler %p cqe_res %d"
+
 # filemonitor-inotify.c
 qemu_file_monitor_add_watch(void *mon, const char *dirpath, const char *filename, void *cb, void *opaque, int64_t id) "File monitor %p add watch dir='%s' file='%s' cb=%p opaque=%p id=%" PRId64
 qemu_file_monitor_remove_watch(void *mon, const char *dirpath, int64_t id) "File monitor %p remove watch dir='%s' id=%" PRId64
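
With the add_sqe hook in place, code running in an AioContext thread can submit arbitrary io_uring requests on the context's ring. A hypothetical caller of the aio_add_sqe() API added in util/aio-posix.c above could look like this; the CqeHandler fields cb and cqe are taken from this series, while ReadOp, the read_op_*() functions and handle_result() are purely illustrative:

    #include "qemu/osdep.h"
    #include "block/aio.h" /* assumed to declare aio_add_sqe() and CqeHandler */

    typedef struct {
        CqeHandler cqe_handler; /* must stay alive until the cqe arrives */
        int fd;
        void *buf;
        unsigned len;
        uint64_t offset;
    } ReadOp;

    static void handle_result(ReadOp *op, int res)
    {
        /* illustrative stub: real code would complete the request here */
        (void)op;
        (void)res;
    }

    /* Called back with an unprepared sqe; fill it in */
    static void read_op_prep(struct io_uring_sqe *sqe, void *opaque)
    {
        ReadOp *op = opaque;

        io_uring_prep_read(sqe, op->fd, op->buf, op->len, op->offset);
    }

    /* Invoked later by fdmon_io_uring_dispatch() */
    static void read_op_complete(CqeHandler *cqe_handler)
    {
        ReadOp *op = container_of(cqe_handler, ReadOp, cqe_handler);

        /* cqe.res carries the usual io_uring result: bytes read or -errno */
        handle_result(op, op->cqe_handler.cqe.res);
    }

    static void read_op_start(ReadOp *op)
    {
        op->cqe_handler.cb = read_op_complete;

        /* Must be called from the AioContext's home thread */
        aio_add_sqe(read_op_prep, op, &op->cqe_handler);
    }

Because completions are queued on cqe_handler_ready_list and dispatched from fdmon_io_uring_dispatch(), read_op_complete() also runs in the AioContext thread, with defer_call() batching already active.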