Merge tag 'for-upstream' of https://repo.or.cz/qemu/kevin into staging

Block layer patches

- stream: Fix potential crash during job completion
- aio: add the aio_add_sqe() io_uring API
- qcow2: put discards in discard queue when discard-no-unref is enabled
- qcow2, vmdk: Restrict creation with secondary file using protocol
- qemu-img rebase: Fix assertion failure due to exceeding IO_BUF_SIZE
- iotests: Run iotests with sanitizers
- iotests: Add more image formats to the thorough testing
- iotests: Improve the dry run list to speed up thorough testing
- Code cleanup

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCgAvFiEE3D3rFZqa+V09dFb+fwmycsiPL9YFAmkTqWcRHGt3b2xmQHJl
# ZGhhdC5jb20ACgkQfwmycsiPL9awPg//VqEgqYbEr3dVUvBFk8tlcewoo7KGICVk
# 4kddOwMJIdcsVpiLuNzqQARH2kHV93Hiv+mVt25o00PkJx565eCGTh/bBFas3UXL
# JMBjgHyJutGr4cijkNrnQgqWfeTgc32xdVEWh1nZM2K7LslzC9I1PfUzfxRMYqZA
# Em0KE3vwQDC7xtIyk4t451hkfcQY8fwN9bDMpD+zbzaLsYTEyOJ900En88iW7oHE
# TuJhrviin11jdQCA26QVNXRaw7iIVVo8vJP1VEgbn31iY+Qpcr/HcQRs0x2gex67
# OqIdh4onqkdGCFDxTGUoAH+jORXWUmk/JipIhl9pJP0ZDyAjsm97ThJ6SvctURsK
# UMU0dzXEc1C5spD2CWnN0PujqHYQqYaylx7MdiCJMjaCfDB3ZeIRsTGoiLMB24P+
# WBrcn2P+f03nC/sVvxRZWrpyI2kZwEh1RsO/mnLQ3apVBFeKqaFi8Ouo9oi1ZMd6
# ahUw7sZSoTxmGY1FhOSRCGEh2Wjy0ZIOx9tHT1U9vig5Kf9KeE81yO8yaq2T60mq
# 9eaUL8rcUrKRiJw9NUkcEYmIUJrh0nUe/kK2RWmbEGMYIH7ASrGqiyUP5FxpekD+
# i/uen4BeyRwe6rnPOzGolg+HMysMBr8VD/8PwJ8g88FLH1jIdTYvFUdRbrkciUlo
# okC+y4+kqiU=
# =SI8s
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 11 Nov 2025 10:23:51 PM CET
# gpg:                using RSA key DC3DEB159A9AF95D3D7456FE7F09B272C88F2FD6
# gpg:                issuer "kwolf@redhat.com"
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: DC3D EB15 9A9A F95D 3D74  56FE 7F09 B272 C88F 2FD6

* tag 'for-upstream' of https://repo.or.cz/qemu/kevin: (28 commits)
  qemu-img rebase: don't exceed IO_BUF_SIZE in one operation
  qcow2, vmdk: Restrict creation with secondary file using protocol
  block: Allow drivers to control protocol prefix at creation
  tests/qemu-iotest: Add more image formats to the thorough testing
  tests/qemu-iotests: Improve the dry run list to speed up thorough testing
  tests/qemu-iotests/184: Fix skip message for qemu-img without throttle
  qcow2: put discards in discard queue when discard-no-unref is enabled
  qcow2: rename update_refcount_discard to queue_discard
  iotests: Run iotests with sanitizers
  qemu-img: Fix amend option parse error handling
  iotests: Test resizing file node under raw with size/offset
  block: Drop detach_subchain for bdrv_replace_node
  block: replace TABs with space
  block/io_uring: use non-vectored read/write when possible
  block/io_uring: use aio_add_sqe()
  aio-posix: add aio_add_sqe() API for user-defined io_uring requests
  aio-posix: add fdmon_ops->dispatch()
  aio-posix: unindent fdmon_io_uring_destroy()
  aio-posix: gracefully handle io_uring_queue_init() failure
  aio: add errp argument to aio_context_setup()
  ...

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
commit 9febfa94b6
Richard Henderson <richard.henderson@linaro.org>, 2025-11-12 11:47:42 +01:00
47 changed files with 1040 additions and 805 deletions

block.c

@@ -693,7 +693,7 @@ out:
}
int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
Error **errp)
bool allow_protocol_prefix, Error **errp)
{
QemuOpts *protocol_opts;
BlockDriver *drv;
@@ -702,7 +702,7 @@ int coroutine_fn bdrv_co_create_file(const char *filename, QemuOpts *opts,
GLOBAL_STATE_CODE();
drv = bdrv_find_protocol(filename, true, errp);
drv = bdrv_find_protocol(filename, allow_protocol_prefix, errp);
if (drv == NULL) {
return -ENOENT;
}
@@ -5398,17 +5398,13 @@ bdrv_replace_node_noperm(BlockDriverState *from,
*
* With auto_skip=false the error is returned if from has a parent which should
* not be updated.
*
* With @detach_subchain=true @to must be in a backing chain of @from. In this
* case backing link of the cow-parent of @to is removed.
*/
static int GRAPH_WRLOCK
bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to,
bool auto_skip, bool detach_subchain, Error **errp)
bool auto_skip, Error **errp)
{
Transaction *tran = tran_new();
g_autoptr(GSList) refresh_list = NULL;
BlockDriverState *to_cow_parent = NULL;
int ret;
GLOBAL_STATE_CODE();
@@ -5417,17 +5413,6 @@ bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to,
assert(to->quiesce_counter);
assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
if (detach_subchain) {
assert(bdrv_chain_contains(from, to));
assert(from != to);
for (to_cow_parent = from;
bdrv_filter_or_cow_bs(to_cow_parent) != to;
to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent))
{
;
}
}
/*
* Do the replacement without permission update.
* Replacement may influence the permissions, we should calculate new
@@ -5439,11 +5424,6 @@ bdrv_replace_node_common(BlockDriverState *from, BlockDriverState *to,
goto out;
}
if (detach_subchain) {
/* to_cow_parent is already drained because from is drained */
bdrv_remove_child(bdrv_filter_or_cow_child(to_cow_parent), tran);
}
refresh_list = g_slist_prepend(refresh_list, to);
refresh_list = g_slist_prepend(refresh_list, from);
@@ -5462,7 +5442,7 @@ out:
int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
Error **errp)
{
return bdrv_replace_node_common(from, to, true, false, errp);
return bdrv_replace_node_common(from, to, true, errp);
}
int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
@@ -5478,7 +5458,7 @@ int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
bdrv_drained_begin(child_bs);
bdrv_graph_wrlock();
ret = bdrv_replace_node_common(bs, child_bs, true, true, errp);
ret = bdrv_replace_node_common(bs, child_bs, true, errp);
bdrv_graph_wrunlock();
bdrv_drained_end(child_bs);
@@ -5929,17 +5909,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
updated_children = g_slist_prepend(updated_children, c);
}
/*
* It seems correct to pass detach_subchain=true here, but it triggers
* one more yet not fixed bug, when due to nested aio_poll loop we switch to
* another drained section, which modify the graph (for example, removing
* the child, which we keep in updated_children list). So, it's a TODO.
*
* Note, bug triggered if pass detach_subchain=true here and run
* test-bdrv-drain. test_drop_intermediate_poll() test-case will crash.
* That's a FIXME.
*/
bdrv_replace_node_common(top, base, false, false, &local_err);
bdrv_replace_node_common(top, base, false, &local_err);
bdrv_graph_wrunlock();
if (local_err) {

block/bochs.c

@@ -300,15 +300,15 @@ static void bochs_close(BlockDriverState *bs)
}
static BlockDriver bdrv_bochs = {
.format_name = "bochs",
.instance_size = sizeof(BDRVBochsState),
.bdrv_probe = bochs_probe,
.bdrv_open = bochs_open,
.format_name = "bochs",
.instance_size = sizeof(BDRVBochsState),
.bdrv_probe = bochs_probe,
.bdrv_open = bochs_open,
.bdrv_child_perm = bdrv_default_perms,
.bdrv_refresh_limits = bochs_refresh_limits,
.bdrv_co_preadv = bochs_co_preadv,
.bdrv_close = bochs_close,
.is_format = true,
.bdrv_co_preadv = bochs_co_preadv,
.bdrv_close = bochs_close,
.is_format = true,
};
static void bdrv_bochs_init(void)

block/crypto.c

@@ -835,7 +835,7 @@ block_crypto_co_create_opts_luks(BlockDriver *drv, const char *filename,
}
/* Create protocol layer */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto fail;
}

block/file-posix.c

@@ -133,7 +133,7 @@
#define FTYPE_FILE 0
#define FTYPE_CD 1
#define MAX_BLOCKSIZE 4096
#define MAX_BLOCKSIZE 4096
/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
* leaving a few more bytes for its future use. */
@@ -755,14 +755,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
#endif /* !defined(CONFIG_LINUX_AIO) */
#ifndef CONFIG_LINUX_IO_URING
if (s->use_linux_io_uring) {
#ifdef CONFIG_LINUX_IO_URING
if (!aio_has_io_uring()) {
error_setg(errp, "aio=io_uring was specified, but is not "
"available (disabled via io_uring_disabled "
"sysctl or blocked by container runtime "
"seccomp policy?)");
ret = -EINVAL;
goto fail;
}
#else
error_setg(errp, "aio=io_uring was specified, but is not supported "
"in this build.");
"in this build");
ret = -EINVAL;
goto fail;
}
#endif /* !defined(CONFIG_LINUX_IO_URING) */
}
s->has_discard = true;
s->has_write_zeroes = true;
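(Note on the two file-posix.c hunks around this point: availability of aio=io_uring is now checked once at open time via the new aio_has_io_uring() helper, so an unusable io_uring setup fails the open with a clear error. Previously, a build with io_uring could only detect the problem at first I/O in raw_check_linux_io_uring(), removed below, which printed a warning and silently fell back to the thread pool.)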
@@ -2522,27 +2531,6 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
return true;
}
#ifdef CONFIG_LINUX_IO_URING
static inline bool raw_check_linux_io_uring(BDRVRawState *s)
{
Error *local_err = NULL;
AioContext *ctx;
if (!s->use_linux_io_uring) {
return false;
}
ctx = qemu_get_current_aio_context();
if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) {
error_reportf_err(local_err, "Unable to use linux io_uring, "
"falling back to thread pool: ");
s->use_linux_io_uring = false;
return false;
}
return true;
}
#endif
#ifdef CONFIG_LINUX_AIO
static inline bool raw_check_linux_aio(BDRVRawState *s)
{
@@ -2595,7 +2583,7 @@ raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, uint64_t bytes,
if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_IO_URING
} else if (raw_check_linux_io_uring(s)) {
} else if (s->use_linux_io_uring) {
assert(qiov->size == bytes);
ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
goto out;
@@ -2692,7 +2680,7 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
};
#ifdef CONFIG_LINUX_IO_URING
if (raw_check_linux_io_uring(s)) {
if (s->use_linux_io_uring) {
return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
}
#endif
@@ -4574,20 +4562,20 @@ static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked)
}
static BlockDriver bdrv_host_cdrom = {
.format_name = "host_cdrom",
.protocol_name = "host_cdrom",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_probe_device = cdrom_probe_device,
.bdrv_parse_filename = cdrom_parse_filename,
.bdrv_open = cdrom_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
.create_opts = &bdrv_create_opts_simple,
.mutable_opts = mutable_opts,
.format_name = "host_cdrom",
.protocol_name = "host_cdrom",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_probe_device = cdrom_probe_device,
.bdrv_parse_filename = cdrom_parse_filename,
.bdrv_open = cdrom_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
.create_opts = &bdrv_create_opts_simple,
.mutable_opts = mutable_opts,
.bdrv_co_invalidate_cache = raw_co_invalidate_cache,
.bdrv_co_preadv = raw_co_preadv,
@@ -4700,20 +4688,20 @@ static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked)
}
static BlockDriver bdrv_host_cdrom = {
.format_name = "host_cdrom",
.protocol_name = "host_cdrom",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_probe_device = cdrom_probe_device,
.bdrv_parse_filename = cdrom_parse_filename,
.bdrv_open = cdrom_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
.create_opts = &bdrv_create_opts_simple,
.mutable_opts = mutable_opts,
.format_name = "host_cdrom",
.protocol_name = "host_cdrom",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_probe_device = cdrom_probe_device,
.bdrv_parse_filename = cdrom_parse_filename,
.bdrv_open = cdrom_open,
.bdrv_close = raw_close,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
.create_opts = &bdrv_create_opts_simple,
.mutable_opts = mutable_opts,
.bdrv_co_preadv = raw_co_preadv,
.bdrv_co_pwritev = raw_co_pwritev,

block/file-win32.c

@@ -741,16 +741,16 @@ static QemuOptsList raw_create_opts = {
};
BlockDriver bdrv_file = {
.format_name = "file",
.protocol_name = "file",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_parse_filename = raw_parse_filename,
.bdrv_open = raw_open,
.bdrv_refresh_limits = raw_probe_alignment,
.bdrv_close = raw_close,
.bdrv_co_create_opts = raw_co_create_opts,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.format_name = "file",
.protocol_name = "file",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_parse_filename = raw_parse_filename,
.bdrv_open = raw_open,
.bdrv_refresh_limits = raw_probe_alignment,
.bdrv_close = raw_close,
.bdrv_co_create_opts = raw_co_create_opts,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_reopen_prepare = raw_reopen_prepare,
.bdrv_reopen_commit = raw_reopen_commit,
@@ -914,15 +914,15 @@ done:
}
static BlockDriver bdrv_host_device = {
.format_name = "host_device",
.protocol_name = "host_device",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_parse_filename = hdev_parse_filename,
.bdrv_probe_device = hdev_probe_device,
.bdrv_open = hdev_open,
.bdrv_close = raw_close,
.bdrv_refresh_limits = hdev_refresh_limits,
.format_name = "host_device",
.protocol_name = "host_device",
.instance_size = sizeof(BDRVRawState),
.bdrv_needs_filename = true,
.bdrv_parse_filename = hdev_parse_filename,
.bdrv_probe_device = hdev_probe_device,
.bdrv_open = hdev_open,
.bdrv_close = raw_close,
.bdrv_refresh_limits = hdev_refresh_limits,
.bdrv_aio_preadv = raw_aio_preadv,
.bdrv_aio_pwritev = raw_aio_pwritev,

block/io_uring.c

@@ -11,28 +11,20 @@
#include "qemu/osdep.h"
#include <liburing.h>
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/coroutine.h"
#include "qemu/defer-call.h"
#include "qapi/error.h"
#include "system/block-backend.h"
#include "trace.h"
/* Only used for assertions. */
#include "qemu/coroutine_int.h"
/* io_uring ring size */
#define MAX_ENTRIES 128
typedef struct LuringAIOCB {
typedef struct {
Coroutine *co;
struct io_uring_sqe sqeq;
ssize_t ret;
QEMUIOVector *qiov;
bool is_read;
QSIMPLEQ_ENTRY(LuringAIOCB) next;
uint64_t offset;
ssize_t ret;
int type;
int fd;
BdrvRequestFlags flags;
/*
* Buffered reads may require resubmission, see
@@ -40,36 +32,69 @@ typedef struct LuringAIOCB {
*/
int total_read;
QEMUIOVector resubmit_qiov;
} LuringAIOCB;
typedef struct LuringQueue {
unsigned int in_queue;
unsigned int in_flight;
bool blocked;
QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue;
} LuringQueue;
CqeHandler cqe_handler;
} LuringRequest;
struct LuringState {
AioContext *aio_context;
struct io_uring ring;
/* No locking required, only accessed from AioContext home thread */
LuringQueue io_q;
QEMUBH *completion_bh;
};
/**
* luring_resubmit:
*
* Resubmit a request by appending it to submit_queue. The caller must ensure
* that ioq_submit() is called later so that submit_queue requests are started.
*/
static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
{
QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
s->io_q.in_queue++;
LuringRequest *req = opaque;
QEMUIOVector *qiov = req->qiov;
uint64_t offset = req->offset;
int fd = req->fd;
BdrvRequestFlags flags = req->flags;
switch (req->type) {
case QEMU_AIO_WRITE:
{
int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
if (luring_flags != 0 || qiov->niov > 1) {
#ifdef HAVE_IO_URING_PREP_WRITEV2
io_uring_prep_writev2(sqe, fd, qiov->iov,
qiov->niov, offset, luring_flags);
#else
/*
* FUA should only be enabled with HAVE_IO_URING_PREP_WRITEV2, see
* luring_has_fua().
*/
assert(luring_flags == 0);
io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
#endif
} else {
/* The man page says non-vectored is faster than vectored */
struct iovec *iov = qiov->iov;
io_uring_prep_write(sqe, fd, iov->iov_base, iov->iov_len, offset);
}
break;
}
case QEMU_AIO_ZONE_APPEND:
io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
break;
case QEMU_AIO_READ:
{
if (req->resubmit_qiov.iov != NULL) {
qiov = &req->resubmit_qiov;
}
if (qiov->niov > 1) {
io_uring_prep_readv(sqe, fd, qiov->iov, qiov->niov,
offset + req->total_read);
} else {
/* The man page says non-vectored is faster than vectored */
struct iovec *iov = qiov->iov;
io_uring_prep_read(sqe, fd, iov->iov_base, iov->iov_len,
offset + req->total_read);
}
break;
}
case QEMU_AIO_FLUSH:
io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
break;
default:
fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
__func__, req->type);
abort();
}
}
/**
@@ -78,385 +103,115 @@ static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
* Short reads are rare but may occur. The remaining read request needs to be
* resubmitted.
*/
static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
int nread)
static void luring_resubmit_short_read(LuringRequest *req, int nread)
{
QEMUIOVector *resubmit_qiov;
size_t remaining;
trace_luring_resubmit_short_read(s, luringcb, nread);
trace_luring_resubmit_short_read(req, nread);
/* Update read position */
luringcb->total_read += nread;
remaining = luringcb->qiov->size - luringcb->total_read;
req->total_read += nread;
remaining = req->qiov->size - req->total_read;
/* Shorten qiov */
resubmit_qiov = &luringcb->resubmit_qiov;
resubmit_qiov = &req->resubmit_qiov;
if (resubmit_qiov->iov == NULL) {
qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov);
qemu_iovec_init(resubmit_qiov, req->qiov->niov);
} else {
qemu_iovec_reset(resubmit_qiov);
}
qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read,
remaining);
qemu_iovec_concat(resubmit_qiov, req->qiov, req->total_read, remaining);
/* Update sqe */
luringcb->sqeq.off += nread;
luringcb->sqeq.addr = (uintptr_t)luringcb->resubmit_qiov.iov;
luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
luring_resubmit(s, luringcb);
aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
}
/**
* luring_process_completions:
* @s: AIO state
*
* Fetches completed I/O requests, consumes cqes and invokes their callbacks
* The function is somewhat tricky because it supports nested event loops, for
* example when a request callback invokes aio_poll().
*
* Function schedules BH completion so it can be called again in a nested
* event loop. When there are no events left to complete the BH is being
* canceled.
*
*/
static void luring_process_completions(LuringState *s)
static void luring_cqe_handler(CqeHandler *cqe_handler)
{
struct io_uring_cqe *cqes;
int total_bytes;
LuringRequest *req = container_of(cqe_handler, LuringRequest, cqe_handler);
int ret = cqe_handler->cqe.res;
defer_call_begin();
trace_luring_cqe_handler(req, ret);
/*
* Request completion callbacks can run the nested event loop.
* Schedule ourselves so the nested event loop will "see" remaining
* completed requests and process them. Without this, completion
* callbacks that wait for other requests using a nested event loop
* would hang forever.
*
* This workaround is needed because io_uring uses poll_wait, which
* is woken up when new events are added to the uring, thus polling on
* the same uring fd will block unless more events are received.
*
* Other leaf block drivers (drivers that access the data themselves)
* are networking based, so they poll sockets for data and run the
* correct coroutine.
*/
qemu_bh_schedule(s->completion_bh);
while (io_uring_peek_cqe(&s->ring, &cqes) == 0) {
LuringAIOCB *luringcb;
int ret;
if (!cqes) {
break;
if (ret < 0) {
/*
* Only writev/readv/fsync requests on regular files or host block
* devices are submitted. Therefore -EAGAIN is not expected but it's
* known to happen sometimes with Linux SCSI. Submit again and hope
* the request completes successfully.
*
* For more information, see:
* https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
*
* If the code is changed to submit other types of requests in the
* future, then this workaround may need to be extended to deal with
* genuine -EAGAIN results that should not be resubmitted
* immediately.
*/
if (ret == -EINTR || ret == -EAGAIN) {
aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
return;
}
luringcb = io_uring_cqe_get_data(cqes);
ret = cqes->res;
io_uring_cqe_seen(&s->ring, cqes);
cqes = NULL;
/* Change counters one-by-one because we can be nested. */
s->io_q.in_flight--;
trace_luring_process_completion(s, luringcb, ret);
} else if (req->qiov) {
/* total_read is non-zero only for resubmitted read requests */
total_bytes = ret + luringcb->total_read;
int total_bytes = ret + req->total_read;
if (ret < 0) {
/*
* Only writev/readv/fsync requests on regular files or host block
* devices are submitted. Therefore -EAGAIN is not expected but it's
* known to happen sometimes with Linux SCSI. Submit again and hope
* the request completes successfully.
*
* For more information, see:
* https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
*
* If the code is changed to submit other types of requests in the
* future, then this workaround may need to be extended to deal with
* genuine -EAGAIN results that should not be resubmitted
* immediately.
*/
if (ret == -EINTR || ret == -EAGAIN) {
luring_resubmit(s, luringcb);
continue;
}
} else if (!luringcb->qiov) {
goto end;
} else if (total_bytes == luringcb->qiov->size) {
if (total_bytes == req->qiov->size) {
ret = 0;
/* Only read/write */
} else {
/* Short Read/Write */
if (luringcb->is_read) {
if (req->type == QEMU_AIO_READ) {
if (ret > 0) {
luring_resubmit_short_read(s, luringcb, ret);
continue;
} else {
/* Pad with zeroes */
qemu_iovec_memset(luringcb->qiov, total_bytes, 0,
luringcb->qiov->size - total_bytes);
ret = 0;
luring_resubmit_short_read(req, ret);
return;
}
/* Pad with zeroes */
qemu_iovec_memset(req->qiov, total_bytes, 0,
req->qiov->size - total_bytes);
ret = 0;
} else {
ret = -ENOSPC;
}
}
end:
luringcb->ret = ret;
qemu_iovec_destroy(&luringcb->resubmit_qiov);
/*
* If the coroutine is already entered it must be in ioq_submit()
* and will notice luringcb->ret has been filled in when it
* eventually runs later. Coroutines cannot be entered recursively
* so avoid doing that!
*/
assert(luringcb->co->ctx == s->aio_context);
if (!qemu_coroutine_entered(luringcb->co)) {
aio_co_wake(luringcb->co);
}
}
qemu_bh_cancel(s->completion_bh);
req->ret = ret;
qemu_iovec_destroy(&req->resubmit_qiov);
defer_call_end();
}
static int ioq_submit(LuringState *s)
{
int ret = 0;
LuringAIOCB *luringcb, *luringcb_next;
while (s->io_q.in_queue > 0) {
/*
* Try to fetch sqes from the ring for requests waiting in
* the overflow queue
*/
QSIMPLEQ_FOREACH_SAFE(luringcb, &s->io_q.submit_queue, next,
luringcb_next) {
struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring);
if (!sqes) {
break;
}
/* Prep sqe for submission */
*sqes = luringcb->sqeq;
QSIMPLEQ_REMOVE_HEAD(&s->io_q.submit_queue, next);
}
ret = io_uring_submit(&s->ring);
trace_luring_io_uring_submit(s, ret);
/* Prevent infinite loop if submission is refused */
if (ret <= 0) {
if (ret == -EAGAIN || ret == -EINTR) {
continue;
}
break;
}
s->io_q.in_flight += ret;
s->io_q.in_queue -= ret;
}
s->io_q.blocked = (s->io_q.in_queue > 0);
if (s->io_q.in_flight) {
/*
* We can try to complete something just right away if there are
* still requests in-flight.
*/
luring_process_completions(s);
}
return ret;
}
static void luring_process_completions_and_submit(LuringState *s)
{
luring_process_completions(s);
if (s->io_q.in_queue > 0) {
ioq_submit(s);
/*
* If the coroutine is already entered it must be in luring_co_submit() and
* will notice req->ret has been filled in when it eventually runs later.
* Coroutines cannot be entered recursively so avoid doing that!
*/
if (!qemu_coroutine_entered(req->co)) {
aio_co_wake(req->co);
}
}
static void qemu_luring_completion_bh(void *opaque)
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd,
uint64_t offset, QEMUIOVector *qiov,
int type, BdrvRequestFlags flags)
{
LuringState *s = opaque;
luring_process_completions_and_submit(s);
}
static void qemu_luring_completion_cb(void *opaque)
{
LuringState *s = opaque;
luring_process_completions_and_submit(s);
}
static bool qemu_luring_poll_cb(void *opaque)
{
LuringState *s = opaque;
return io_uring_cq_ready(&s->ring);
}
static void qemu_luring_poll_ready(void *opaque)
{
LuringState *s = opaque;
luring_process_completions_and_submit(s);
}
static void ioq_init(LuringQueue *io_q)
{
QSIMPLEQ_INIT(&io_q->submit_queue);
io_q->in_queue = 0;
io_q->in_flight = 0;
io_q->blocked = false;
}
static void luring_deferred_fn(void *opaque)
{
LuringState *s = opaque;
trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
s->io_q.in_flight);
if (!s->io_q.blocked && s->io_q.in_queue > 0) {
ioq_submit(s);
}
}
/**
* luring_do_submit:
* @fd: file descriptor for I/O
* @luringcb: AIO control block
* @s: AIO state
* @offset: offset for request
* @type: type of request
*
* Fetches sqes from ring, adds to pending queue and preps them
*
*/
static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
uint64_t offset, int type, BdrvRequestFlags flags)
{
int ret;
struct io_uring_sqe *sqes = &luringcb->sqeq;
switch (type) {
case QEMU_AIO_WRITE:
#ifdef HAVE_IO_URING_PREP_WRITEV2
{
int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
io_uring_prep_writev2(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset, luring_flags);
}
#else
assert(flags == 0);
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
#endif
break;
case QEMU_AIO_ZONE_APPEND:
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
break;
case QEMU_AIO_READ:
io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
break;
case QEMU_AIO_FLUSH:
io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC);
break;
default:
fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
__func__, type);
abort();
}
io_uring_sqe_set_data(sqes, luringcb);
QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
s->io_q.in_queue++;
trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
s->io_q.in_flight);
if (!s->io_q.blocked) {
if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
ret = ioq_submit(s);
trace_luring_do_submit_done(s, ret);
return ret;
}
defer_call(luring_deferred_fn, s);
}
return 0;
}
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
QEMUIOVector *qiov, int type,
BdrvRequestFlags flags)
{
int ret;
AioContext *ctx = qemu_get_current_aio_context();
LuringState *s = aio_get_linux_io_uring(ctx);
LuringAIOCB luringcb = {
LuringRequest req = {
.co = qemu_coroutine_self(),
.ret = -EINPROGRESS,
.qiov = qiov,
.is_read = (type == QEMU_AIO_READ),
.ret = -EINPROGRESS,
.type = type,
.fd = fd,
.offset = offset,
.flags = flags,
};
trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
type);
ret = luring_do_submit(fd, &luringcb, s, offset, type, flags);
if (ret < 0) {
return ret;
}
req.cqe_handler.cb = luring_cqe_handler;
if (luringcb.ret == -EINPROGRESS) {
trace_luring_co_submit(bs, &req, fd, offset, qiov ? qiov->size : 0, type);
aio_add_sqe(luring_prep_sqe, &req, &req.cqe_handler);
if (req.ret == -EINPROGRESS) {
qemu_coroutine_yield();
}
return luringcb.ret;
}
void luring_detach_aio_context(LuringState *s, AioContext *old_context)
{
aio_set_fd_handler(old_context, s->ring.ring_fd,
NULL, NULL, NULL, NULL, s);
qemu_bh_delete(s->completion_bh);
s->aio_context = NULL;
}
void luring_attach_aio_context(LuringState *s, AioContext *new_context)
{
s->aio_context = new_context;
s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s);
aio_set_fd_handler(s->aio_context, s->ring.ring_fd,
qemu_luring_completion_cb, NULL,
qemu_luring_poll_cb, qemu_luring_poll_ready, s);
}
LuringState *luring_init(Error **errp)
{
int rc;
LuringState *s = g_new0(LuringState, 1);
struct io_uring *ring = &s->ring;
trace_luring_init_state(s, sizeof(*s));
rc = io_uring_queue_init(MAX_ENTRIES, ring, 0);
if (rc < 0) {
error_setg_errno(errp, -rc, "failed to init linux io_uring ring");
g_free(s);
return NULL;
}
ioq_init(&s->io_q);
return s;
}
void luring_cleanup(LuringState *s)
{
io_uring_queue_exit(&s->ring);
trace_luring_cleanup_state(s);
g_free(s);
return req.ret;
}
bool luring_has_fua(void)

block/parallels.c

@@ -1117,7 +1117,7 @@ parallels_co_create_opts(BlockDriver *drv, const char *filename,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto done;
}

block/qcow.c

@@ -978,7 +978,7 @@ qcow_co_create_opts(BlockDriver *drv, const char *filename,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto fail;
}
@@ -1184,11 +1184,11 @@ static const char *const qcow_strong_runtime_opts[] = {
};
static BlockDriver bdrv_qcow = {
.format_name = "qcow",
.instance_size = sizeof(BDRVQcowState),
.bdrv_probe = qcow_probe,
.bdrv_open = qcow_open,
.bdrv_close = qcow_close,
.format_name = "qcow",
.instance_size = sizeof(BDRVQcowState),
.bdrv_probe = qcow_probe,
.bdrv_open = qcow_open,
.bdrv_close = qcow_close,
.bdrv_child_perm = bdrv_default_perms,
.bdrv_reopen_prepare = qcow_reopen_prepare,
.bdrv_co_create = qcow_co_create,

block/qcow2-cluster.c

@@ -1978,12 +1978,10 @@ discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
if (!keep_reference) {
/* Then decrease the refcount */
qcow2_free_any_cluster(bs, old_l2_entry, type);
} else if (s->discard_passthrough[type] &&
(cluster_type == QCOW2_CLUSTER_NORMAL ||
cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
} else {
/* If we keep the reference, pass on the discard still */
bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
s->cluster_size);
qcow2_discard_cluster(bs, old_l2_entry & L2E_OFFSET_MASK,
s->cluster_size, cluster_type, type);
}
}
@@ -2092,12 +2090,10 @@ zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
if (!keep_reference) {
/* Then decrease the refcount */
qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
} else if (s->discard_passthrough[QCOW2_DISCARD_REQUEST] &&
(type == QCOW2_CLUSTER_NORMAL ||
type == QCOW2_CLUSTER_ZERO_ALLOC)) {
} else {
/* If we keep the reference, pass on the discard still */
bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
s->cluster_size);
qcow2_discard_cluster(bs, old_l2_entry & L2E_OFFSET_MASK,
s->cluster_size, type, QCOW2_DISCARD_REQUEST);
}
}
}

block/qcow2-refcount.c

@@ -754,8 +754,8 @@ void qcow2_process_discards(BlockDriverState *bs, int ret)
}
}
static void update_refcount_discard(BlockDriverState *bs,
uint64_t offset, uint64_t length)
static void queue_discard(BlockDriverState *bs,
uint64_t offset, uint64_t length)
{
BDRVQcow2State *s = bs->opaque;
Qcow2DiscardRegion *d, *p, *next;
@@ -902,7 +902,7 @@ update_refcount(BlockDriverState *bs, int64_t offset, int64_t length,
}
if (s->discard_passthrough[type]) {
update_refcount_discard(bs, cluster_offset, s->cluster_size);
queue_discard(bs, cluster_offset, s->cluster_size);
}
}
}
@@ -1205,6 +1205,23 @@ void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
}
}
void qcow2_discard_cluster(BlockDriverState *bs, uint64_t offset,
uint64_t length, QCow2ClusterType ctype,
enum qcow2_discard_type dtype)
{
BDRVQcow2State *s = bs->opaque;
if (s->discard_passthrough[dtype] &&
(ctype == QCOW2_CLUSTER_NORMAL ||
ctype == QCOW2_CLUSTER_ZERO_ALLOC)) {
if (has_data_file(bs)) {
bdrv_pdiscard(s->data_file, offset, length);
} else {
queue_discard(bs, offset, length);
}
}
}
int qcow2_write_caches(BlockDriverState *bs)
{
BDRVQcow2State *s = bs->opaque;
@@ -3619,7 +3636,7 @@ qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
/* discard refblock from the cache if refblock is cached */
qcow2_cache_discard(s->refcount_block_cache, refblock);
}
update_refcount_discard(bs, discard_block_offs, s->cluster_size);
queue_discard(bs, discard_block_offs, s->cluster_size);
return 0;
}
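(Context for the qcow2 hunks above: with the discard-no-unref option enabled, e.g. -blockdev driver=qcow2,discard-no-unref=on,..., a discarded cluster keeps its refcount but the discard is still passed down. The new qcow2_discard_cluster() helper now routes that passthrough discard through queue_discard() so it is coalesced and batched by qcow2_process_discards(), just like refcount-dropping discards already were; only images with an external data file still call bdrv_pdiscard() directly, presumably because the discard queue is processed against the image file itself.)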

block/qcow2.c

@@ -3956,7 +3956,7 @@ qcow2_co_create_opts(BlockDriver *drv, const char *filename, QemuOpts *opts,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto finish;
}
@@ -3971,7 +3971,7 @@
/* Create and open an external data file (protocol layer) */
val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE);
if (val) {
ret = bdrv_co_create_file(val, opts, errp);
ret = bdrv_co_create_file(val, opts, false, errp);
if (ret < 0) {
goto finish;
}

block/qcow2.h

@@ -880,6 +880,10 @@ void GRAPH_RDLOCK qcow2_free_clusters(BlockDriverState *bs,
void GRAPH_RDLOCK
qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
enum qcow2_discard_type type);
void GRAPH_RDLOCK
qcow2_discard_cluster(BlockDriverState *bs, uint64_t offset,
uint64_t length, QCow2ClusterType ctype,
enum qcow2_discard_type dtype);
int GRAPH_RDLOCK
qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset,

block/qed.c

@@ -788,7 +788,7 @@ bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto fail;
}

block/raw-format.c

@@ -463,7 +463,7 @@ static int coroutine_fn GRAPH_UNLOCKED
raw_co_create_opts(BlockDriver *drv, const char *filename,
QemuOpts *opts, Error **errp)
{
return bdrv_co_create_file(filename, opts, errp);
return bdrv_co_create_file(filename, opts, true, errp);
}
static int raw_open(BlockDriverState *bs, QDict *options, int flags,

block/trace-events

@@ -62,15 +62,9 @@ qmp_block_stream(void *bs) "bs %p"
file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
# io_uring.c
luring_init_state(void *s, size_t size) "s %p size %zu"
luring_cleanup_state(void *s) "%p freed"
luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d"
luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d"
luring_cqe_handler(void *req, int ret) "req %p ret %d"
luring_co_submit(void *bs, void *req, int fd, uint64_t offset, size_t nbytes, int type) "bs %p req %p fd %d offset %" PRId64 " nbytes %zd type %d"
luring_resubmit_short_read(void *req, int nread) "req %p nread %d"
# qcow2.c
qcow2_add_task(void *co, void *bs, void *pool, const char *action, int cluster_type, uint64_t host_offset, uint64_t offset, uint64_t bytes, void *qiov, size_t qiov_offset) "co %p bs %p pool %p: %s: cluster_type %d file_cluster_offset %" PRIu64 " offset %" PRIu64 " bytes %" PRIu64 " qiov %p qiov_offset %zu"

block/vdi.c

@@ -938,7 +938,7 @@ vdi_co_create_opts(BlockDriver *drv, const char *filename,
qdict = qemu_opts_to_qdict_filtered(opts, NULL, &vdi_create_opts, true);
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto done;
}

block/vhdx.c

@@ -2096,7 +2096,7 @@ vhdx_co_create_opts(BlockDriver *drv, const char *filename,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto fail;
}

block/vmdk.c

@@ -2334,7 +2334,7 @@ vmdk_create_extent(const char *filename, int64_t filesize, bool flat,
int ret;
BlockBackend *blk = NULL;
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, false, errp);
if (ret < 0) {
goto exit;
}

block/vpc.c

@@ -1118,7 +1118,7 @@ vpc_co_create_opts(BlockDriver *drv, const char *filename,
}
/* Create and open the file (protocol layer) */
ret = bdrv_co_create_file(filename, opts, errp);
ret = bdrv_co_create_file(filename, opts, true, errp);
if (ret < 0) {
goto fail;
}

include/block/aio.h

@@ -61,6 +61,27 @@ typedef struct LuringState LuringState;
/* Is polling disabled? */
bool aio_poll_disabled(AioContext *ctx);
#ifdef CONFIG_LINUX_IO_URING
/*
* Each io_uring request must have a unique CqeHandler that processes the cqe.
* The lifetime of a CqeHandler must be at least from aio_add_sqe() until
* ->cb() invocation.
*/
typedef struct CqeHandler CqeHandler;
struct CqeHandler {
/* Called by the AioContext when the request has completed */
void (*cb)(CqeHandler *handler);
/* Used internally, do not access this */
QSIMPLEQ_ENTRY(CqeHandler) next;
/* This field is filled in before ->cb() is called */
struct io_uring_cqe cqe;
};
typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
#endif /* CONFIG_LINUX_IO_URING */
/* Callbacks for file descriptor monitoring implementations */
typedef struct {
/*
@@ -106,6 +127,78 @@ typedef struct {
* Returns: true if ->wait() should be called, false otherwise.
*/
bool (*need_wait)(AioContext *ctx);
/*
* dispatch:
* @ctx: the AioContext
*
* Dispatch any work that is specific to this file descriptor monitoring
* implementation. Usually the event loop's generic file descriptor
* monitoring, BH, and timer dispatching code is sufficient, but file
* descriptor monitoring implementations offering additional functionality
* may need to implement this function for custom behavior. Called at a
* point in the event loop when it is safe to invoke user-defined
* callbacks.
*
* This function is optional and may be NULL.
*
* Returns: true if progress was made (see aio_poll()'s return value),
* false otherwise.
*/
bool (*dispatch)(AioContext *ctx);
/*
* gsource_prepare:
* @ctx: the AioContext
*
* Prepare for the glib event loop to wait for events instead of the usual
* ->wait() call. See glib's GSourceFuncs->prepare().
*/
void (*gsource_prepare)(AioContext *ctx);
/*
* gsource_check:
* @ctx: the AioContext
*
* Called by the glib event loop from glib's GSourceFuncs->check() after
* waiting for events.
*
* Returns: true when ready to be dispatched.
*/
bool (*gsource_check)(AioContext *ctx);
/*
* gsource_dispatch:
* @ctx: the AioContext
* @ready_list: list for handlers that become ready
*
* Place ready AioHandlers on ready_list. Called as part of the glib event
* loop from glib's GSourceFuncs->dispatch().
*
* Called with list_lock incremented.
*/
void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
#ifdef CONFIG_LINUX_IO_URING
/**
* add_sqe: Add an io_uring sqe for submission.
* @prep_sqe: invoked with an sqe that should be prepared for submission
* @opaque: user-defined argument to @prep_sqe()
* @cqe_handler: the unique cqe handler associated with this request
*
* The caller's @prep_sqe() function is invoked to fill in the details of
* the sqe. Do not call io_uring_sqe_set_data() on this sqe.
*
* The kernel may see the sqe as soon as @prep_sqe() returns or it may take
* until the next event loop iteration.
*
* This function is called from the current AioContext and is not
* thread-safe.
*/
void (*add_sqe)(AioContext *ctx,
void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
void *opaque, CqeHandler *cqe_handler);
#endif /* CONFIG_LINUX_IO_URING */
} FDMonOps;
/*
@@ -217,12 +310,14 @@ struct AioContext {
struct LinuxAioState *linux_aio;
#endif
#ifdef CONFIG_LINUX_IO_URING
LuringState *linux_io_uring;
/* State for file descriptor monitoring using Linux io_uring */
struct io_uring fdmon_io_uring;
AioHandlerSList submit_list;
#endif
void *io_uring_fd_tag;
/* Pending callback state for cqe handlers */
CqeHandlerSimpleQ cqe_handler_ready_list;
#endif /* CONFIG_LINUX_IO_URING */
/* TimerLists for calling timers - one per clock type. Has its own
* locking.
@@ -254,7 +349,13 @@ struct AioContext {
/* epoll(7) state used when built with CONFIG_EPOLL */
int epollfd;
/* The GSource unix fd tag for epollfd */
void *epollfd_tag;
const FDMonOps *fdmon_ops;
/* Was aio_context_new() successful? */
bool initialized;
};
/**
@@ -512,11 +613,6 @@ struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
/* Return the LinuxAioState bound to this AioContext */
struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
/* Setup the LuringState bound to this AioContext */
LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp);
/* Return the LuringState bound to this AioContext */
LuringState *aio_get_linux_io_uring(AioContext *ctx);
/**
* aio_timer_new_with_attrs:
* @ctx: the aio context
@@ -679,10 +775,13 @@ void qemu_set_current_aio_context(AioContext *ctx);
/**
* aio_context_setup:
* @ctx: the aio context
* @errp: error pointer
*
* Initialize the aio context.
*
* Returns: true on success, false otherwise
*/
void aio_context_setup(AioContext *ctx);
bool aio_context_setup(AioContext *ctx, Error **errp);
/**
* aio_context_destroy:
@@ -692,9 +791,6 @@ void aio_context_setup(AioContext *ctx);
*/
void aio_context_destroy(AioContext *ctx);
/* Used internally, do not call outside AioContext code */
void aio_context_use_g_source(AioContext *ctx);
/**
* aio_context_set_poll_params:
* @ctx: the aio context
@@ -724,4 +820,40 @@ void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
*/
void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
int64_t max, Error **errp);
#ifdef CONFIG_LINUX_IO_URING
/**
* aio_has_io_uring: Return whether io_uring is available.
*
* io_uring is either available in all AioContexts or in none, so this only
* needs to be called once from within any thread's AioContext.
*/
static inline bool aio_has_io_uring(void)
{
AioContext *ctx = qemu_get_current_aio_context();
return ctx->fdmon_ops->add_sqe;
}
/**
* aio_add_sqe: Add an io_uring sqe for submission.
* @prep_sqe: invoked with an sqe that should be prepared for submission
* @opaque: user-defined argument to @prep_sqe()
* @cqe_handler: the unique cqe handler associated with this request
*
* The caller's @prep_sqe() function is invoked to fill in the details of the
* sqe. Do not call io_uring_sqe_set_data() on this sqe.
*
* The sqe is submitted by the current AioContext. The kernel may see the sqe
* as soon as @prep_sqe() returns or it may take until the next event loop
* iteration.
*
* When the AioContext is destroyed, pending sqes are ignored and their
* CqeHandlers are not invoked.
*
* This function must be called only when aio_has_io_uring() returns true.
*/
void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
void *opaque, CqeHandler *cqe_handler);
#endif /* CONFIG_LINUX_IO_URING */
#endif
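
(To make the contract above concrete, here is a minimal sketch of a caller, modeled on the luring_co_submit() conversion earlier in this series; MyRequest and my_fsync() are hypothetical names, while CqeHandler, aio_has_io_uring() and aio_add_sqe() are the API declared in this header.)

/*
 * Hypothetical example: fsync a file descriptor through the AioContext's
 * io_uring instance and wait for completion in a coroutine.
 */
typedef struct {
    Coroutine *co;          /* coroutine waiting for the result */
    int fd;
    int ret;                /* set from cqe.res by the handler */
    CqeHandler cqe_handler; /* unique per request, must outlive ->cb() */
} MyRequest;

static void my_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
{
    MyRequest *req = opaque;

    /* Fill in the sqe; do not call io_uring_sqe_set_data() */
    io_uring_prep_fsync(sqe, req->fd, IORING_FSYNC_DATASYNC);
}

static void my_cqe_handler(CqeHandler *cqe_handler)
{
    MyRequest *req = container_of(cqe_handler, MyRequest, cqe_handler);

    req->ret = cqe_handler->cqe.res; /* cqe is filled in before ->cb() */
    aio_co_wake(req->co);
}

static int coroutine_fn my_fsync(int fd)
{
    MyRequest req = {
        .co = qemu_coroutine_self(),
        .fd = fd,
        .ret = -EINPROGRESS,
    };

    req.cqe_handler.cb = my_cqe_handler;
    assert(aio_has_io_uring()); /* callers must check availability first */

    aio_add_sqe(my_prep_sqe, &req, &req.cqe_handler);
    if (req.ret == -EINPROGRESS) {
        qemu_coroutine_yield(); /* woken from my_cqe_handler() */
    }
    return req.ret;
}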

include/block/block-global-state.h

@@ -65,7 +65,8 @@ int co_wrapper bdrv_create(BlockDriver *drv, const char *filename,
QemuOpts *opts, Error **errp);
int coroutine_fn GRAPH_UNLOCKED
bdrv_co_create_file(const char *filename, QemuOpts *opts, Error **errp);
bdrv_co_create_file(const char *filename, QemuOpts *opts,
bool allow_protocol_prefix, Error **errp);
BlockDriverState *bdrv_new(void);
int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,

include/block/nbd.h

@@ -296,7 +296,7 @@ enum {
NBD_CMD_BLOCK_STATUS = 7,
};
#define NBD_DEFAULT_PORT 10809
#define NBD_DEFAULT_PORT 10809
/* Maximum size of a single READ/WRITE data buffer */
#define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024)

include/block/raw-aio.h

@@ -74,15 +74,10 @@ static inline bool laio_has_fua(void)
#endif
/* io_uring.c - Linux io_uring implementation */
#ifdef CONFIG_LINUX_IO_URING
LuringState *luring_init(Error **errp);
void luring_cleanup(LuringState *s);
/* luring_co_submit: submit I/O requests in the thread's current AioContext. */
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
QEMUIOVector *qiov, int type,
BdrvRequestFlags flags);
void luring_detach_aio_context(LuringState *s, AioContext *old_context);
void luring_attach_aio_context(LuringState *s, AioContext *new_context);
bool luring_has_fua(void);
#else
static inline bool luring_has_fua(void)

meson.build

@@ -2745,6 +2745,8 @@ endif
if linux_io_uring.found()
config_host_data.set('HAVE_IO_URING_PREP_WRITEV2',
cc.has_header_symbol('liburing.h', 'io_uring_prep_writev2'))
config_host_data.set('HAVE_IO_URING_CQ_HAS_OVERFLOW',
cc.has_header_symbol('liburing.h', 'io_uring_cq_has_overflow'))
endif
config_host_data.set('HAVE_TCP_KEEPCNT',
cc.has_header_symbol('netinet/tcp.h', 'TCP_KEEPCNT') or

qemu-img.c

@@ -4081,7 +4081,7 @@ static int img_rebase(const img_cmd_t *ccmd, int argc, char **argv)
n += offset - QEMU_ALIGN_DOWN(offset, write_align);
offset = QEMU_ALIGN_DOWN(offset, write_align);
n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n);
n = MIN(n, size - offset);
n = MIN(n, MIN(size - offset, IO_BUF_SIZE));
assert(!bdrv_is_allocated(unfiltered_bs, offset, n, &n_alloc) &&
n_alloc == n);
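(A worked example of the new bound: with 2 MiB overlay clusters, rebasing the dirty region [1 MiB, 3 MiB) gets rounded out to the cluster boundaries [0 MiB, 4 MiB) by the alignment code above, so a single operation could previously reach 4 MiB and overrun the 2 MiB IO_BUF_SIZE buffer; the added MIN(..., IO_BUF_SIZE) caps each copy at the buffer size. This is exactly the layout exercised by the new iotest 024 case below.)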
@@ -4597,9 +4597,9 @@ static int img_amend(const img_cmd_t *ccmd, int argc, char **argv)
amend_opts = qemu_opts_append(amend_opts, bs->drv->amend_opts);
opts = qemu_opts_create(amend_opts, NULL, 0, &error_abort);
if (!qemu_opts_do_parse(opts, options, NULL, &err)) {
qemu_opts_del(opts);
/* Try to parse options using the create options */
amend_opts = qemu_opts_append(amend_opts, bs->drv->create_opts);
qemu_opts_del(opts);
opts = qemu_opts_create(amend_opts, NULL, 0, &error_abort);
if (qemu_opts_do_parse(opts, options, NULL, NULL)) {
error_append_hint(&err,

stubs/io_uring.c (deleted)

@@ -1,32 +0,0 @@
/*
* Linux io_uring support.
*
* Copyright (C) 2009 IBM, Corp.
* Copyright (C) 2009 Red Hat, Inc.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include "block/aio.h"
#include "block/raw-aio.h"
void luring_detach_aio_context(LuringState *s, AioContext *old_context)
{
abort();
}
void luring_attach_aio_context(LuringState *s, AioContext *new_context)
{
abort();
}
LuringState *luring_init(Error **errp)
{
abort();
}
void luring_cleanup(LuringState *s)
{
abort();
}

stubs/meson.build

@@ -32,9 +32,6 @@ if have_block or have_ga
stub_ss.add(files('cpus-virtual-clock.c'))
stub_ss.add(files('icount.c'))
stub_ss.add(files('graph-lock.c'))
if linux_io_uring.found()
stub_ss.add(files('io_uring.c'))
endif
if libaio.found()
stub_ss.add(files('linux-aio.c'))
endif

tests/qemu-iotests/024

@@ -315,6 +315,52 @@ echo
$QEMU_IMG map "$OVERLAY" | _filter_qemu_img_map
# Check that the region to copy to the overlay during a rebase
# operation does not exceed the I/O buffer size.
#
# backing_new <-- backing_old <-- overlay
#
# Backing (new): -- -- -- -- <-- Empty image, size 4MB
# Backing (old):|--|ff|ff|--| <-- 4 clusters, 1MB each
# Overlay: |-- --|-- --| <-- 2 clusters, 2MB each
#
# The data at [1MB, 3MB) must be copied from the old backing image to
# the overlay. However the rebase code will extend that region to the
# overlay's (sub)cluster boundaries to avoid CoW (see commit 12df580b).
# This test checks that IO_BUF_SIZE (2 MB) is taken into account.
echo
echo "=== Test that the region to copy does not exceed 2MB (IO_BUF_SIZE) ==="
echo
echo "Creating backing chain"
echo
TEST_IMG=$BASE_NEW _make_test_img 4M
TEST_IMG=$BASE_OLD CLUSTER_SIZE=1M _make_test_img -b "$BASE_NEW" -F $IMGFMT
TEST_IMG=$OVERLAY CLUSTER_SIZE=2M _make_test_img -b "$BASE_OLD" -F $IMGFMT
echo
echo "Writing data to region [1MB, 3MB)"
echo
$QEMU_IO "$BASE_OLD" -c "write -P 0xff 1M 2M" | _filter_qemu_io
echo
echo "Rebasing"
echo
$QEMU_IMG rebase -b "$BASE_NEW" -F $IMGFMT "$OVERLAY"
echo "Verifying the data"
echo
$QEMU_IO "$OVERLAY" -c "read -P 0x00 0 1M" | _filter_qemu_io
$QEMU_IO "$OVERLAY" -c "read -P 0xff 1M 2M" | _filter_qemu_io
$QEMU_IO "$OVERLAY" -c "read -P 0x00 3M 1M" | _filter_qemu_io
$QEMU_IMG map "$OVERLAY" | _filter_qemu_img_map
echo
# success, all done

tests/qemu-iotests/024.out

@@ -243,4 +243,30 @@ Offset Length File
0 0x20000 TEST_DIR/subdir/t.IMGFMT
0x40000 0x20000 TEST_DIR/subdir/t.IMGFMT
=== Test that the region to copy does not exceed 2MB (IO_BUF_SIZE) ===
Creating backing chain
Formatting 'TEST_DIR/subdir/t.IMGFMT.base_new', fmt=IMGFMT size=4194304
Formatting 'TEST_DIR/subdir/t.IMGFMT.base_old', fmt=IMGFMT size=4194304 backing_file=TEST_DIR/subdir/t.IMGFMT.base_new backing_fmt=IMGFMT
Formatting 'TEST_DIR/subdir/t.IMGFMT', fmt=IMGFMT size=4194304 backing_file=TEST_DIR/subdir/t.IMGFMT.base_old backing_fmt=IMGFMT
Writing data to region [1MB, 3MB)
wrote 2097152/2097152 bytes at offset 1048576
2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
Rebasing
Verifying the data
read 1048576/1048576 bytes at offset 0
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 2097152/2097152 bytes at offset 1048576
2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 1048576/1048576 bytes at offset 3145728
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
Offset Length File
0 0x400000 TEST_DIR/subdir/t.IMGFMT
*** done

tests/qemu-iotests/184

@@ -51,7 +51,7 @@ run_qemu()
}
test_throttle=$($QEMU_IMG --help|grep throttle)
[ "$test_throttle" = "" ] && _supported_fmt throttle
[ "$test_throttle" = "" ] && _notrun "qemu-img does not support throttle"
echo
echo "== checking interface =="

tests/qemu-iotests/257

@@ -310,14 +310,18 @@ def test_bitmap_sync(bsync_mode, msync_mode='bitmap', failure=None):
'state': 1,
'new_state': 2
}, {
'event': 'read_aio',
'event': 'flush_to_disk',
'state': 2,
'new_state': 3
}, {
'event': "read_aio",
'state': 3,
'new_state': 4
}],
'inject-error': [{
'event': 'read_aio',
'errno': 5,
'state': 3,
'state': 4,
'immediately': False,
'once': True
}]

tests/qemu-iotests/257.out

@@ -272,7 +272,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -1017,7 +1017,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -1762,7 +1762,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -2507,7 +2507,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -3252,7 +3252,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -3997,7 +3997,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---
@@ -4742,7 +4742,7 @@ qemu_img compare "TEST_DIR/PID-img" "TEST_DIR/PID-fbackup2" ==> Identical, OK!
--- Preparing image & VM ---
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 3}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "read_aio", "new-state": 3, "state": 2}]}, "node-name": "drive0"}}
{"execute": "blockdev-add", "arguments": {"driver": "qcow2", "file": {"driver": "blkdebug", "image": {"driver": "file", "filename": "TEST_DIR/PID-img"}, "inject-error": [{"errno": 5, "event": "read_aio", "immediately": false, "once": true, "state": 4}], "set-state": [{"event": "flush_to_disk", "new-state": 2, "state": 1}, {"event": "flush_to_disk", "new-state": 3, "state": 2}, {"event": "read_aio", "new-state": 4, "state": 3}]}, "node-name": "drive0"}}
{"return": {}}
--- Write #0 ---


@@ -17,6 +17,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import re
import sys
import argparse
import shutil
@@ -82,7 +83,7 @@ def make_argparser() -> argparse.ArgumentParser:
g_env.add_argument('-i', dest='aiomode', default='threads',
help='sets AIOMODE environment variable')
p.set_defaults(imgfmt='raw', imgproto='file')
p.set_defaults(imgproto='file')
format_list = ['raw', 'bochs', 'cloop', 'parallels', 'qcow', 'qcow2',
'qed', 'vdi', 'vpc', 'vhdx', 'vmdk', 'luks', 'dmg', 'vvfat']
@@ -137,15 +138,50 @@ def make_argparser() -> argparse.ArgumentParser:
return p
def dry_run_list(test_dir, imgfmt, testlist):
    for t in testlist:
        if not imgfmt:
            print(os.path.basename(t))
            continue

        # If a format has been given, we look for the "supported_fmt"
        # and the "unsupported_fmt" lines in the test and try to find out
        # whether the format is supported or not. This is only a heuristic
        # (it can fail if, e.g., the "unsupported_fmts" and "supported_fmts"
        # statements are on the same line), but it should be good enough
        # to get a proper list for "make check-block"
        with open(os.path.join(test_dir, t), 'r', encoding='utf-8') as fh:
            supported = True
            check_next_line = False
            sd = "[ \t'\"]"     # Start delimiter
            ed = "([ \t'\"]|$)" # End delimiter
            for line in fh:
                if 'unsupported_fmt' in line:
                    if re.search(sd + imgfmt + ed, line):
                        supported = False
                        break
                elif 'supported_fmt' in line or check_next_line:
                    if re.search(sd + 'generic' + ed, line):
                        continue  # Might be followed by "unsupported" line
                    supported = re.search(sd + imgfmt + ed, line)
                    check_next_line = ']' not in line and \
                        ('supported_fmts=[' in line or check_next_line)
                    if supported or not check_next_line:
                        break
        if supported:
            print(os.path.basename(t))
if __name__ == '__main__':
warnings.simplefilter("default")
os.environ["PYTHONWARNINGS"] = "default"
args = make_argparser().parse_args()
image_format = args.imgfmt or 'raw'
env = TestEnv(source_dir=args.source_dir,
build_dir=args.build_dir,
imgfmt=args.imgfmt, imgproto=args.imgproto,
imgfmt=image_format, imgproto=args.imgproto,
aiomode=args.aiomode, cachemode=args.cachemode,
imgopts=args.imgopts, misalign=args.misalign,
debug=args.debug, valgrind=args.valgrind,
@@ -189,7 +225,7 @@ if __name__ == '__main__':
if args.dry_run:
with env:
print('\n'.join([os.path.basename(t) for t in tests]))
dry_run_list(env.source_iotests, args.imgfmt, tests)
else:
with TestRunner(env, tap=args.tap,
color=args.color) as tr:


@@ -2,14 +2,6 @@ if not have_tools or host_os == 'windows'
subdir_done()
endif
foreach cflag: qemu_ldflags
if cflag.startswith('-fsanitize') and \
not cflag.contains('safe-stack') and not cflag.contains('cfi-icall')
message('Sanitizers are enabled ==> Disabled the qemu-iotests.')
subdir_done()
endif
endforeach
bash = find_program('bash', required: false, version: '>= 4.0')
if not bash.found()
message('bash >= v4.0 not available ==> Disabled the qemu-iotests.')
@@ -21,7 +13,10 @@ qemu_iotests_env = {'PYTHON': python.full_path()}
qemu_iotests_formats = {
'qcow2': 'quick',
'raw': 'slow',
'parallels': 'thorough',
'qed': 'thorough',
'vdi': 'thorough',
'vhdx': 'thorough',
'vmdk': 'thorough',
'vpc': 'thorough'
}


@@ -263,10 +263,21 @@ class TestRunner(contextlib.AbstractContextManager['TestRunner']):
Path(env[d]).mkdir(parents=True, exist_ok=True)
test_dir = env['TEST_DIR']
f_asan = Path(test_dir, f_test.name + '.out.asan')
f_bad = Path(test_dir, f_test.name + '.out.bad')
f_notrun = Path(test_dir, f_test.name + '.notrun')
f_casenotrun = Path(test_dir, f_test.name + '.casenotrun')
env['ASAN_OPTIONS'] = f'detect_leaks=0:log_path={f_asan}'
def unlink_asan():
    with os.scandir(test_dir) as it:
        for entry in it:
            if entry.name.startswith(f_asan.name):
                os.unlink(entry)

unlink_asan()

for p in (f_notrun, f_casenotrun):
    silent_unlink(p)
@@ -312,6 +323,7 @@ class TestRunner(contextlib.AbstractContextManager['TestRunner']):
description=f'output mismatch (see {f_bad})',
diff=diff, casenotrun=casenotrun)
else:
unlink_asan()
f_bad.unlink()
return TestResult(status='pass', elapsed=elapsed,
casenotrun=casenotrun)


@@ -8,16 +8,27 @@
# SPDX-License-Identifier: GPL-2.0-or-later
import os
from typing import Dict, Optional
import iotests
from iotests import imgfmt, qemu_img_create, QMPTestCase
image_size = 1 * 1024 * 1024
image = os.path.join(iotests.test_dir, 'test.img')
class TestResizeBelowRaw(QMPTestCase):
class BaseResizeBelowRaw(QMPTestCase):
raw_size: Optional[int] = None
raw_offset: Optional[int] = None
def setUp(self) -> None:
qemu_img_create('-f', imgfmt, image, str(image_size))
extra_options: Dict[str, str] = {}
if self.raw_size is not None:
extra_options['size'] = str(self.raw_size)
if self.raw_offset is not None:
extra_options['offset'] = str(self.raw_offset)
self.vm = iotests.VM()
self.vm.add_blockdev(self.vm.qmp_to_opts({
'driver': imgfmt,
@@ -26,7 +37,8 @@ class TestResizeBelowRaw(QMPTestCase):
'driver': 'file',
'filename': image,
'node-name': 'file0',
}
},
**extra_options
}))
self.vm.launch()
@@ -34,14 +46,16 @@
self.vm.shutdown()
os.remove(image)
def assert_size(self, size: int) -> None:
def assert_size(self, size: int, file_size: Optional[int] = None) -> None:
nodes = self.vm.qmp('query-named-block-nodes', flat=True)['return']
self.assertEqual(len(nodes), 2)
for node in nodes:
if node['drv'] == 'file':
if node['drv'] == 'file' and file_size is not None:
self.assertEqual(node['image']['virtual-size'], file_size)
continue
self.assertEqual(node['image']['virtual-size'], size)
class TestResizeBelowUnlimitedRaw(BaseResizeBelowRaw):
def test_resize_below_raw(self) -> None:
self.assert_size(image_size)
self.vm.qmp('block_resize', node_name='file0', size=2*image_size)
@@ -49,5 +63,36 @@ class TestResizeBelowRaw(QMPTestCase):
self.vm.qmp('block_resize', node_name='node0', size=3*image_size)
self.assert_size(3*image_size)
# offset = 0 behaves the same as absent offset
class TestResizeBelowRawWithZeroOffset(TestResizeBelowUnlimitedRaw):
    raw_offset = 0

class TestResizeBelowRawWithSize(BaseResizeBelowRaw):
    raw_size = image_size // 2

    def test_resize_below_raw_with_size(self) -> None:
        self.assert_size(image_size // 2, image_size)

        # This QMP command fails because node0 unshares RESIZE
        self.vm.qmp('block_resize', node_name='file0', size=2*image_size)
        self.assert_size(image_size // 2, image_size)

        # This QMP command fails because node0 is a fixed-size disk
        self.vm.qmp('block_resize', node_name='node0', size=3*image_size)
        self.assert_size(image_size // 2, image_size)

class TestResizeBelowRawWithOffset(BaseResizeBelowRaw):
    raw_offset = image_size // 4

    def test_resize_below_raw_with_offset(self) -> None:
        self.assert_size(image_size * 3 // 4, image_size)

        # This QMP command fails because node0 unshares RESIZE
        self.vm.qmp('block_resize', node_name='file0', size=2*image_size)
        self.assert_size(image_size * 3 // 4, image_size)

        self.vm.qmp('block_resize', node_name='node0', size=3*image_size)
        self.assert_size(3 * image_size, image_size * 13 // 4)

if __name__ == '__main__':
    iotests.main(supported_fmts=['raw'], supported_protocols=['file'])


@@ -1,5 +1,5 @@
.
....
----------------------------------------------------------------------
Ran 1 tests
Ran 4 tests
OK


@@ -527,7 +527,12 @@ static void test_source_bh_delete_from_cb(void)
g_assert_cmpint(data1.n, ==, data1.max);
g_assert(data1.bh == NULL);
assert(g_main_context_iteration(NULL, false));
/*
* There may be up to one more iteration due to the aio_notify
* EventNotifier.
*/
g_main_context_iteration(NULL, false);
assert(!g_main_context_iteration(NULL, false));
}


@@ -15,6 +15,7 @@
#include "qemu/osdep.h"
#include "block/aio.h"
#include "qapi/error.h"
#include "util/aio-posix.h"
typedef struct {
AioContext *ctx;
@@ -71,17 +72,17 @@ static void test(void)
.ctx = aio_context_new(&error_abort),
};
if (td.ctx->fdmon_ops != &fdmon_poll_ops) {
/* This test is tied to fdmon-poll.c */
g_test_skip("fdmon_poll_ops not in use");
return;
}
qemu_set_current_aio_context(td.ctx);
/* Enable polling */
aio_context_set_poll_params(td.ctx, 1000000, 2, 2, &error_abort);
/*
* The GSource is unused but this has the side-effect of changing the fdmon
* that AioContext uses.
*/
aio_get_g_source(td.ctx);
/* Make the event notifier active (set) right away */
event_notifier_init(&td.poll_notifier, 1);
aio_set_event_notifier(td.ctx, &td.poll_notifier,


@@ -16,6 +16,7 @@
#include "qemu/osdep.h"
#include "block/block.h"
#include "block/thread-pool.h"
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qemu/lockcnt.h"
#include "qemu/rcu.h"
@@ -70,15 +71,6 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)
static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
{
/* If the GSource is in the process of being destroyed then
* g_source_remove_poll() causes an assertion failure. Skip
* removal in that case, because glib cleans up its state during
* destruction anyway.
*/
if (!g_source_is_destroyed(&ctx->source)) {
g_source_remove_poll(&ctx->source, &node->pfd);
}
node->pfd.revents = 0;
node->poll_ready = false;
@@ -153,7 +145,6 @@ void aio_set_fd_handler(AioContext *ctx,
} else {
new_node->pfd = node->pfd;
}
g_source_add_poll(&ctx->source, &new_node->pfd);
new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
@@ -267,37 +258,13 @@ bool aio_prepare(AioContext *ctx)
poll_set_started(ctx, &ready_list, false);
/* TODO what to do with this list? */
ctx->fdmon_ops->gsource_prepare(ctx);
return false;
}
bool aio_pending(AioContext *ctx)
{
AioHandler *node;
bool result = false;
/*
* We have to walk very carefully in case aio_set_fd_handler is
* called while we're walking.
*/
qemu_lockcnt_inc(&ctx->list_lock);
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
int revents;
/* TODO should this check poll ready? */
revents = node->pfd.revents & node->pfd.events;
if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
result = true;
break;
}
if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
result = true;
break;
}
}
qemu_lockcnt_dec(&ctx->list_lock);
return result;
return ctx->fdmon_ops->gsource_check(ctx);
}
static void aio_free_deleted_handlers(AioContext *ctx)
@@ -390,10 +357,6 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
return progress;
}
/*
* If we have a list of ready handlers then this is more efficient than
* scanning all handlers with aio_dispatch_handlers().
*/
static bool aio_dispatch_ready_handlers(AioContext *ctx,
AioHandlerList *ready_list,
int64_t block_ns)
@@ -417,24 +380,23 @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
return progress;
}
/* Slower than aio_dispatch_ready_handlers() but only used via glib */
static bool aio_dispatch_handlers(AioContext *ctx)
{
AioHandler *node, *tmp;
bool progress = false;
QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
progress = aio_dispatch_handler(ctx, node) || progress;
}
return progress;
}
void aio_dispatch(AioContext *ctx)
{
AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
qemu_lockcnt_inc(&ctx->list_lock);
aio_bh_poll(ctx);
aio_dispatch_handlers(ctx);
ctx->fdmon_ops->gsource_dispatch(ctx, &ready_list);
if (ctx->fdmon_ops->dispatch) {
ctx->fdmon_ops->dispatch(ctx);
}
/* block_ns is 0 because polling is disabled in the glib event loop */
aio_dispatch_ready_handlers(ctx, &ready_list, 0);
aio_free_deleted_handlers(ctx);
qemu_lockcnt_dec(&ctx->list_lock);
@@ -559,7 +521,14 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
max_ns = qemu_soonest_timeout(*timeout, max_ns);
assert(!(max_ns && progress));
} while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
if (ctx->fdmon_ops->need_wait(ctx)) {
if (fdmon_supports_polling(ctx)) {
*timeout = 0; /* stay in polling mode */
}
break;
}
} while (elapsed_time < max_ns);
if (remove_idle_poll_handlers(ctx, ready_list,
start_time + elapsed_time)) {
@@ -722,7 +691,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
* up IO threads when some work becomes pending. It is essential to
* avoid hangs or unnecessary latency.
*/
if (poll_set_started(ctx, &ready_list, false)) {
if (timeout && poll_set_started(ctx, &ready_list, false)) {
timeout = 0;
progress = true;
}
@@ -743,6 +712,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
}
if (ctx->fdmon_ops->dispatch) {
progress |= ctx->fdmon_ops->dispatch(ctx);
}
progress |= aio_bh_poll(ctx);
progress |= aio_dispatch_ready_handlers(ctx, &ready_list, block_ns);
@@ -755,35 +728,50 @@ bool aio_poll(AioContext *ctx, bool blocking)
return progress;
}
void aio_context_setup(AioContext *ctx)
bool aio_context_setup(AioContext *ctx, Error **errp)
{
ctx->fdmon_ops = &fdmon_poll_ops;
ctx->epollfd = -1;
ctx->epollfd_tag = NULL;
/* Use the fastest fd monitoring implementation if available */
if (fdmon_io_uring_setup(ctx)) {
return;
#ifdef CONFIG_LINUX_IO_URING
{
static bool need_io_uring;
Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */
/* io_uring takes precedence because it provides aio_add_sqe() support */
if (fdmon_io_uring_setup(ctx, &local_err)) {
/*
* If one AioContext gets io_uring, then all AioContexts need io_uring
* so that aio_add_sqe() support is available across all threads.
*/
need_io_uring = true;
return true;
}
if (need_io_uring) {
error_propagate(errp, local_err);
return false;
}
/* Silently fall back on systems where io_uring is unavailable */
error_free(local_err);
}
#endif /* CONFIG_LINUX_IO_URING */
fdmon_epoll_setup(ctx);
return true;
}
void aio_context_destroy(AioContext *ctx)
{
#ifdef CONFIG_LINUX_IO_URING
fdmon_io_uring_destroy(ctx);
fdmon_epoll_disable(ctx);
aio_free_deleted_handlers(ctx);
}
#endif
qemu_lockcnt_lock(&ctx->list_lock);
fdmon_epoll_disable(ctx);
qemu_lockcnt_unlock(&ctx->list_lock);
void aio_context_use_g_source(AioContext *ctx)
{
/*
* Disable io_uring when the glib main loop is used because it doesn't
* support mixed glib/aio_poll() usage. It relies on aio_poll() being
* called regularly so that changes to the monitored file descriptors are
* submitted, otherwise a list of pending fd handlers builds up.
*/
fdmon_io_uring_destroy(ctx);
aio_free_deleted_handlers(ctx);
}
@@ -818,3 +806,12 @@ void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch)
aio_notify(ctx);
}
#ifdef CONFIG_LINUX_IO_URING
void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
void *opaque, CqeHandler *cqe_handler)
{
AioContext *ctx = qemu_get_current_aio_context();
ctx->fdmon_ops->add_sqe(ctx, prep_sqe, opaque, cqe_handler);
}
#endif /* CONFIG_LINUX_IO_URING */
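
For orientation, here is a rough sketch of how a caller might use the new entry point, pieced together from the signatures above and the CqeHandler usage in fdmon-io_uring.c below. This is not code from the series: MyNopOp and the my_* names are invented, and it assumes the io_uring fdmon is active (CONFIG_LINUX_IO_URING).

/* Hedged sketch, not part of the patch. Assumes CqeHandler exposes a cb
 * callback and an embedded cqe, as used by fdmon-io_uring.c. */
typedef struct {
    CqeHandler cqe_handler; /* must stay alive until the completion runs */
    bool done;
} MyNopOp;

static void my_prep_nop(struct io_uring_sqe *sqe, void *opaque)
{
    /* Invoked synchronously by aio_add_sqe() to fill in the sqe */
    io_uring_prep_nop(sqe);
}

static void my_nop_cb(CqeHandler *cqe_handler)
{
    /* Invoked later from the fdmon-io_uring dispatch path */
    MyNopOp *op = container_of(cqe_handler, MyNopOp, cqe_handler);
    op->done = true; /* cqe_handler->cqe.res would hold the result */
}

static void my_submit_nop(MyNopOp *op)
{
    op->done = false;
    op->cqe_handler.cb = my_nop_cb;
    aio_add_sqe(my_prep_nop, op, &op->cqe_handler);
}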


@@ -18,6 +18,7 @@
#define AIO_POSIX_H
#include "block/aio.h"
#include "qapi/error.h"
struct AioHandler {
GPollFD pfd;
@@ -35,6 +36,7 @@ struct AioHandler {
#ifdef CONFIG_LINUX_IO_URING
QSLIST_ENTRY(AioHandler) node_submitted;
unsigned flags; /* see fdmon-io_uring.c */
CqeHandler internal_cqe_handler; /* used for POLL_ADD/POLL_REMOVE */
#endif
int64_t poll_idle_timeout; /* when to stop userspace polling */
bool poll_ready; /* has polling detected an event? */
@@ -47,9 +49,14 @@ void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
extern const FDMonOps fdmon_poll_ops;
/* Switch back to poll(2). list_lock must be held. */
void fdmon_poll_downgrade(AioContext *ctx);
#ifdef CONFIG_EPOLL_CREATE1
bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
void fdmon_epoll_setup(AioContext *ctx);
/* list_lock must be held */
void fdmon_epoll_disable(AioContext *ctx);
#else
static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
@@ -67,17 +74,8 @@ static inline void fdmon_epoll_disable(AioContext *ctx)
#endif /* !CONFIG_EPOLL_CREATE1 */
#ifdef CONFIG_LINUX_IO_URING
bool fdmon_io_uring_setup(AioContext *ctx);
bool fdmon_io_uring_setup(AioContext *ctx, Error **errp);
void fdmon_io_uring_destroy(AioContext *ctx);
#else
static inline bool fdmon_io_uring_setup(AioContext *ctx)
{
return false;
}
static inline void fdmon_io_uring_destroy(AioContext *ctx)
{
}
#endif /* !CONFIG_LINUX_IO_URING */
#endif /* AIO_POSIX_H */


@@ -419,18 +419,15 @@ bool aio_poll(AioContext *ctx, bool blocking)
return progress;
}
void aio_context_setup(AioContext *ctx)
bool aio_context_setup(AioContext *ctx, Error **errp)
{
return true;
}
void aio_context_destroy(AioContext *ctx)
{
}
void aio_context_use_g_source(AioContext *ctx)
{
}
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
int64_t grow, int64_t shrink, Error **errp)
{


@@ -366,12 +366,16 @@ aio_ctx_dispatch(GSource *source,
}
static void
aio_ctx_finalize(GSource *source)
{
AioContext *ctx = (AioContext *) source;
QEMUBH *bh;
unsigned flags;
if (!ctx->initialized) {
return;
}
thread_pool_free_aio(ctx->thread_pool);
#ifdef CONFIG_LINUX_AIO
@@ -382,14 +386,6 @@ aio_ctx_finalize(GSource *source)
}
#endif
#ifdef CONFIG_LINUX_IO_URING
if (ctx->linux_io_uring) {
luring_detach_aio_context(ctx->linux_io_uring, ctx);
luring_cleanup(ctx->linux_io_uring);
ctx->linux_io_uring = NULL;
}
#endif
assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
qemu_bh_delete(ctx->co_schedule_bh);
@@ -418,10 +414,11 @@ aio_ctx_finalize(GSource *source)
aio_set_event_notifier(ctx, &ctx->notifier, NULL, NULL, NULL);
event_notifier_cleanup(&ctx->notifier);
qemu_rec_mutex_destroy(&ctx->lock);
qemu_lockcnt_destroy(&ctx->list_lock);
timerlistgroup_deinit(&ctx->tlg);
unregister_aiocontext(ctx);
aio_context_destroy(ctx);
/* aio_context_destroy() still needs the lock */
qemu_lockcnt_destroy(&ctx->list_lock);
}
static GSourceFuncs aio_source_funcs = {
@@ -433,7 +430,6 @@ static GSourceFuncs aio_source_funcs = {
GSource *aio_get_g_source(AioContext *ctx)
{
aio_context_use_g_source(ctx);
g_source_ref(&ctx->source);
return &ctx->source;
}
@@ -465,29 +461,6 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx)
}
#endif
#ifdef CONFIG_LINUX_IO_URING
LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp)
{
if (ctx->linux_io_uring) {
return ctx->linux_io_uring;
}
ctx->linux_io_uring = luring_init(errp);
if (!ctx->linux_io_uring) {
return NULL;
}
luring_attach_aio_context(ctx->linux_io_uring, ctx);
return ctx->linux_io_uring;
}
LuringState *aio_get_linux_io_uring(AioContext *ctx)
{
assert(ctx->linux_io_uring);
return ctx->linux_io_uring;
}
#endif
void aio_notify(AioContext *ctx)
{
/*
@@ -577,19 +550,42 @@ static void co_schedule_bh_cb(void *opaque)
AioContext *aio_context_new(Error **errp)
{
ERRP_GUARD();
int ret;
AioContext *ctx;
/*
* ctx is freed by g_source_unref() (e.g. aio_context_unref()). ctx's
* resources are freed as follows:
*
* 1. By aio_ctx_finalize() after aio_context_new() has returned and set
* ->initialized = true.
*
* 2. By manual cleanup code in this function's error paths before goto
* fail.
*
* Be careful to free resources in both cases!
*/
ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
QSLIST_INIT(&ctx->bh_list);
QSIMPLEQ_INIT(&ctx->bh_slice_list);
aio_context_setup(ctx);
ret = event_notifier_init(&ctx->notifier, false);
if (ret < 0) {
error_setg_errno(errp, -ret, "Failed to initialize event notifier");
goto fail;
}
/*
* Resources cannot easily be freed manually after aio_context_setup(). If
* you add any new resources to AioContext, it's probably best to acquire
* them before aio_context_setup().
*/
if (!aio_context_setup(ctx, errp)) {
event_notifier_cleanup(&ctx->notifier);
goto fail;
}
g_source_set_can_recurse(&ctx->source, true);
qemu_lockcnt_init(&ctx->list_lock);
@@ -604,10 +600,6 @@ AioContext *aio_context_new(Error **errp)
ctx->linux_aio = NULL;
#endif
#ifdef CONFIG_LINUX_IO_URING
ctx->linux_io_uring = NULL;
#endif
ctx->thread_pool = NULL;
qemu_rec_mutex_init(&ctx->lock);
timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
@@ -623,9 +615,11 @@ AioContext *aio_context_new(Error **errp)
register_aiocontext(ctx);
ctx->initialized = true;
return ctx;
fail:
g_source_destroy(&ctx->source);
g_source_unref(&ctx->source);
return NULL;
}


@@ -19,8 +19,12 @@ void fdmon_epoll_disable(AioContext *ctx)
ctx->epollfd = -1;
}
/* Switch back */
ctx->fdmon_ops = &fdmon_poll_ops;
if (ctx->epollfd_tag) {
g_source_remove_unix_fd(&ctx->source, ctx->epollfd_tag);
ctx->epollfd_tag = NULL;
}
fdmon_poll_downgrade(ctx);
}
static inline int epoll_events_from_pfd(int pfd_events)
@@ -93,10 +97,29 @@ out:
return ret;
}
static void fdmon_epoll_gsource_prepare(AioContext *ctx)
{
/* Do nothing */
}
static bool fdmon_epoll_gsource_check(AioContext *ctx)
{
return g_source_query_unix_fd(&ctx->source, ctx->epollfd_tag) & G_IO_IN;
}
static void fdmon_epoll_gsource_dispatch(AioContext *ctx,
AioHandlerList *ready_list)
{
fdmon_epoll_wait(ctx, ready_list, 0);
}
static const FDMonOps fdmon_epoll_ops = {
.update = fdmon_epoll_update,
.wait = fdmon_epoll_wait,
.need_wait = aio_poll_disabled,
.gsource_prepare = fdmon_epoll_gsource_prepare,
.gsource_check = fdmon_epoll_gsource_check,
.gsource_dispatch = fdmon_epoll_gsource_dispatch,
};
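
The poll, epoll and io_uring backends each implement this trio of glib hooks. As a rough map of the interface, this sketch shows where each callback is invoked from, based on the call sites visible in this series; the real FDMonOps declaration in include/block/aio.h is not part of this diff and may differ in detail:

/* Sketch only: inferred shape of the new hooks, invented type name. */
typedef struct {
    /* called from aio_prepare(): submit pending work before glib polls */
    void (*gsource_prepare)(AioContext *ctx);

    /* called from aio_pending(): did monitored fds become ready? */
    bool (*gsource_check)(AioContext *ctx);

    /* called from aio_dispatch(): collect ready fds into ready_list */
    void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);

    /* called from aio_poll()/aio_dispatch(); optional (may be NULL),
     * used by fdmon-io_uring to run queued CqeHandler completions */
    bool (*dispatch)(AioContext *ctx);
} FDMonGlibHooks;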
static bool fdmon_epoll_try_enable(AioContext *ctx)
@@ -118,6 +141,8 @@ static bool fdmon_epoll_try_enable(AioContext *ctx)
}
ctx->fdmon_ops = &fdmon_epoll_ops;
ctx->epollfd_tag = g_source_add_unix_fd(&ctx->source, ctx->epollfd,
G_IO_IN);
return true;
}
@@ -139,12 +164,11 @@ bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
}
ok = fdmon_epoll_try_enable(ctx);
qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
if (!ok) {
fdmon_epoll_disable(ctx);
}
qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
return ok;
}


@@ -45,16 +45,20 @@
#include "qemu/osdep.h"
#include <poll.h>
#include "qapi/error.h"
#include "qemu/defer-call.h"
#include "qemu/rcu_queue.h"
#include "aio-posix.h"
#include "trace.h"
enum {
FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */
/* AioHandler::flags */
FDMON_IO_URING_PENDING = (1 << 0),
FDMON_IO_URING_ADD = (1 << 1),
FDMON_IO_URING_REMOVE = (1 << 2),
FDMON_IO_URING_DELETE_AIO_HANDLER = (1 << 3),
};
static inline int poll_events_from_pfd(int pfd_events)
@@ -74,8 +78,8 @@ static inline int pfd_events_from_poll(int poll_events)
}
/*
* Returns an sqe for submitting a request. Only be called within
* fdmon_io_uring_wait().
* Returns an sqe for submitting a request. Only called from the AioContext
* thread.
*/
static struct io_uring_sqe *get_sqe(AioContext *ctx)
{
@@ -166,41 +170,50 @@ static void fdmon_io_uring_update(AioContext *ctx,
}
}
static void fdmon_io_uring_add_sqe(AioContext *ctx,
void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
void *opaque, CqeHandler *cqe_handler)
{
struct io_uring_sqe *sqe = get_sqe(ctx);
prep_sqe(sqe, opaque);
io_uring_sqe_set_data(sqe, cqe_handler);
trace_fdmon_io_uring_add_sqe(ctx, opaque, sqe->opcode, sqe->fd, sqe->off,
cqe_handler);
}
static void fdmon_special_cqe_handler(CqeHandler *cqe_handler)
{
/*
* This is an empty function that is never called. It is used as a sentinel
* function pointer to distinguish AioHandler cqes from ordinary cqe handlers.
*/
}
static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
{
struct io_uring_sqe *sqe = get_sqe(ctx);
int events = poll_events_from_pfd(node->pfd.events);
io_uring_prep_poll_add(sqe, node->pfd.fd, events);
io_uring_sqe_set_data(sqe, node);
node->internal_cqe_handler.cb = fdmon_special_cqe_handler;
io_uring_sqe_set_data(sqe, &node->internal_cqe_handler);
}
static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
{
struct io_uring_sqe *sqe = get_sqe(ctx);
CqeHandler *cqe_handler = &node->internal_cqe_handler;
#ifdef LIBURING_HAVE_DATA64
io_uring_prep_poll_remove(sqe, (uintptr_t)node);
io_uring_prep_poll_remove(sqe, (uintptr_t)cqe_handler);
#else
io_uring_prep_poll_remove(sqe, node);
io_uring_prep_poll_remove(sqe, cqe_handler);
#endif
io_uring_sqe_set_data(sqe, NULL);
}
/* Add a timeout that self-cancels when another cqe becomes ready */
static void add_timeout_sqe(AioContext *ctx, int64_t ns)
{
struct io_uring_sqe *sqe;
struct __kernel_timespec ts = {
.tv_sec = ns / NANOSECONDS_PER_SECOND,
.tv_nsec = ns % NANOSECONDS_PER_SECOND,
};
sqe = get_sqe(ctx);
io_uring_prep_timeout(sqe, &ts, 1, 0);
io_uring_sqe_set_data(sqe, NULL);
}
/* Add sqes from ctx->submit_list for submission */
static void fill_sq_ring(AioContext *ctx)
{
@@ -218,22 +231,26 @@ static void fill_sq_ring(AioContext *ctx)
if (flags & FDMON_IO_URING_REMOVE) {
add_poll_remove_sqe(ctx, node);
}
if (flags & FDMON_IO_URING_DELETE_AIO_HANDLER) {
/*
* process_cqe() sets this flag after ADD and REMOVE have been
* cleared. They cannot be set again, so they must be clear.
*/
assert(!(flags & FDMON_IO_URING_ADD));
assert(!(flags & FDMON_IO_URING_REMOVE));
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
}
}
}
/* Returns true if a handler became ready */
static bool process_cqe(AioContext *ctx,
AioHandlerList *ready_list,
struct io_uring_cqe *cqe)
static bool process_cqe_aio_handler(AioContext *ctx,
AioHandlerList *ready_list,
AioHandler *node,
struct io_uring_cqe *cqe)
{
AioHandler *node = io_uring_cqe_get_data(cqe);
unsigned flags;
/* poll_timeout and poll_remove have a zero user_data field */
if (!node) {
return false;
}
/*
* Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
* with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
@@ -241,7 +258,12 @@ static bool process_cqe(AioContext *ctx,
*/
flags = qatomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
if (flags & FDMON_IO_URING_REMOVE) {
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
if (flags & FDMON_IO_URING_PENDING) {
/* Still on ctx->submit_list, defer deletion until fill_sq_ring() */
qatomic_or(&node->flags, FDMON_IO_URING_DELETE_AIO_HANDLER);
} else {
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
}
return false;
}
@@ -252,6 +274,35 @@
return true;
}
/* Returns true if a handler became ready */
static bool process_cqe(AioContext *ctx,
AioHandlerList *ready_list,
struct io_uring_cqe *cqe)
{
CqeHandler *cqe_handler = io_uring_cqe_get_data(cqe);
/* poll_timeout and poll_remove have a zero user_data field */
if (!cqe_handler) {
return false;
}
/*
* Special handling for AioHandler cqes. They need ready_list and have a
* return value.
*/
if (cqe_handler->cb == fdmon_special_cqe_handler) {
AioHandler *node = container_of(cqe_handler, AioHandler,
internal_cqe_handler);
return process_cqe_aio_handler(ctx, ready_list, node, cqe);
}
cqe_handler->cqe = *cqe;
/* Handlers are invoked later by fdmon_io_uring_dispatch() */
QSIMPLEQ_INSERT_TAIL(&ctx->cqe_handler_ready_list, cqe_handler, next);
return false;
}
static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
{
struct io_uring *ring = &ctx->fdmon_io_uring;
@@ -260,6 +311,13 @@ static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
unsigned num_ready = 0;
unsigned head;
#ifdef HAVE_IO_URING_CQ_HAS_OVERFLOW
/* If the CQ overflowed then fetch CQEs with a syscall */
if (io_uring_cq_has_overflow(ring)) {
io_uring_get_events(ring);
}
#endif
io_uring_for_each_cqe(ring, head, cqe) {
if (process_cqe(ctx, ready_list, cqe)) {
num_ready++;
@@ -272,23 +330,91 @@ static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
return num_ready;
}
/* This is where SQEs are submitted in the glib event loop */
static void fdmon_io_uring_gsource_prepare(AioContext *ctx)
{
fill_sq_ring(ctx);
if (io_uring_sq_ready(&ctx->fdmon_io_uring)) {
while (io_uring_submit(&ctx->fdmon_io_uring) == -EINTR) {
/* Keep trying if syscall was interrupted */
}
}
}
static bool fdmon_io_uring_gsource_check(AioContext *ctx)
{
gpointer tag = ctx->io_uring_fd_tag;
return g_source_query_unix_fd(&ctx->source, tag) & G_IO_IN;
}
/* Dispatch CQE handlers that are ready */
static bool fdmon_io_uring_dispatch(AioContext *ctx)
{
CqeHandlerSimpleQ *ready_list = &ctx->cqe_handler_ready_list;
bool progress = false;
/* Handlers may use defer_call() to coalesce frequent operations */
defer_call_begin();
while (!QSIMPLEQ_EMPTY(ready_list)) {
CqeHandler *cqe_handler = QSIMPLEQ_FIRST(ready_list);
QSIMPLEQ_REMOVE_HEAD(ready_list, next);
trace_fdmon_io_uring_cqe_handler(ctx, cqe_handler,
cqe_handler->cqe.res);
cqe_handler->cb(cqe_handler);
progress = true;
}
defer_call_end();
return progress;
}
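
The defer_call_begin()/defer_call_end() bracket lets completion callbacks coalesce expensive follow-up work: a function scheduled with defer_call() inside the bracket runs once when the outermost defer_call_end() is reached, with duplicate fn/opaque pairs merged. A minimal sketch under those assumptions (MyQueue, my_flush and my_cqe_cb are invented names, not from the patch):

#include "qemu/osdep.h"
#include "qemu/defer-call.h"
#include "block/aio.h"

typedef struct {
    CqeHandler cqe_handler;
    int pending; /* completions seen since the last flush */
} MyQueue;

/* Runs once at the outermost defer_call_end(), even if many
 * completions queued it during the same dispatch round. */
static void my_flush(void *opaque)
{
    MyQueue *q = opaque;
    /* e.g. ring a hardware doorbell once for q->pending items */
    q->pending = 0;
}

static void my_cqe_cb(CqeHandler *cqe_handler)
{
    MyQueue *q = container_of(cqe_handler, MyQueue, cqe_handler);
    q->pending++;
    defer_call(my_flush, q); /* coalesced with identical fn/opaque calls */
}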
/* This is where CQEs are processed in the glib event loop */
static void fdmon_io_uring_gsource_dispatch(AioContext *ctx,
AioHandlerList *ready_list)
{
process_cq_ring(ctx, ready_list);
}
static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
int64_t timeout)
{
struct __kernel_timespec ts;
unsigned wait_nr = 1; /* block until at least one cqe is ready */
int ret;
if (timeout == 0) {
wait_nr = 0; /* non-blocking */
} else if (timeout > 0) {
add_timeout_sqe(ctx, timeout);
/* Add a timeout that self-cancels when another cqe becomes ready */
struct io_uring_sqe *sqe;
ts = (struct __kernel_timespec){
.tv_sec = timeout / NANOSECONDS_PER_SECOND,
.tv_nsec = timeout % NANOSECONDS_PER_SECOND,
};
sqe = get_sqe(ctx);
io_uring_prep_timeout(sqe, &ts, 1, 0);
io_uring_sqe_set_data(sqe, NULL);
}
fill_sq_ring(ctx);
/*
* Loop to handle signals in both cases:
* 1. If no SQEs were submitted, then -EINTR is returned.
* 2. If SQEs were submitted then the number of SQEs submitted is returned
* rather than -EINTR.
*/
do {
ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
} while (ret == -EINTR);
} while (ret == -EINTR ||
(ret >= 0 && wait_nr > io_uring_cq_ready(&ctx->fdmon_io_uring)));
assert(ret >= 0);
@@ -319,43 +445,66 @@ static const FDMonOps fdmon_io_uring_ops = {
.update = fdmon_io_uring_update,
.wait = fdmon_io_uring_wait,
.need_wait = fdmon_io_uring_need_wait,
.dispatch = fdmon_io_uring_dispatch,
.gsource_prepare = fdmon_io_uring_gsource_prepare,
.gsource_check = fdmon_io_uring_gsource_check,
.gsource_dispatch = fdmon_io_uring_gsource_dispatch,
.add_sqe = fdmon_io_uring_add_sqe,
};
bool fdmon_io_uring_setup(AioContext *ctx)
bool fdmon_io_uring_setup(AioContext *ctx, Error **errp)
{
int ret;
ctx->io_uring_fd_tag = NULL;
ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
if (ret != 0) {
error_setg_errno(errp, -ret, "Failed to initialize io_uring");
return false;
}
QSLIST_INIT(&ctx->submit_list);
QSIMPLEQ_INIT(&ctx->cqe_handler_ready_list);
ctx->fdmon_ops = &fdmon_io_uring_ops;
ctx->io_uring_fd_tag = g_source_add_unix_fd(&ctx->source,
ctx->fdmon_io_uring.ring_fd, G_IO_IN);
return true;
}
void fdmon_io_uring_destroy(AioContext *ctx)
{
if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
AioHandler *node;
io_uring_queue_exit(&ctx->fdmon_io_uring);
if (ctx->fdmon_ops != &fdmon_io_uring_ops) {
return;
}
/* Move handlers due to be removed onto the deleted list */
while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
unsigned flags = qatomic_fetch_and(&node->flags,
~(FDMON_IO_URING_PENDING |
FDMON_IO_URING_ADD |
FDMON_IO_URING_REMOVE));
io_uring_queue_exit(&ctx->fdmon_io_uring);
if (flags & FDMON_IO_URING_REMOVE) {
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
}
/* Move handlers due to be removed onto the deleted list */
while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
unsigned flags = qatomic_fetch_and(&node->flags,
~(FDMON_IO_URING_PENDING |
FDMON_IO_URING_ADD |
FDMON_IO_URING_REMOVE |
FDMON_IO_URING_DELETE_AIO_HANDLER));
QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
if ((flags & FDMON_IO_URING_REMOVE) ||
(flags & FDMON_IO_URING_DELETE_AIO_HANDLER)) {
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers,
node, node_deleted);
}
ctx->fdmon_ops = &fdmon_poll_ops;
QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
}
g_source_remove_unix_fd(&ctx->source, ctx->io_uring_fd_tag);
ctx->io_uring_fd_tag = NULL;
assert(QSIMPLEQ_EMPTY(&ctx->cqe_handler_ready_list));
qemu_lockcnt_lock(&ctx->list_lock);
fdmon_poll_downgrade(ctx);
qemu_lockcnt_unlock(&ctx->list_lock);
}


@@ -72,6 +72,11 @@ static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
/* epoll(7) is faster above a certain number of fds */
if (fdmon_epoll_try_upgrade(ctx, npfd)) {
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events) {
g_source_remove_poll(&ctx->source, &node->pfd);
}
}
npfd = 0; /* we won't need pollfds[], reset npfd */
return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
}
@@ -97,11 +102,89 @@ static void fdmon_poll_update(AioContext *ctx,
AioHandler *old_node,
AioHandler *new_node)
{
/* Do nothing, AioHandler already contains the state we'll need */
if (old_node) {
/*
* If the GSource is in the process of being destroyed then
* g_source_remove_poll() causes an assertion failure. Skip removal in
* that case, because glib cleans up its state during destruction
* anyway.
*/
if (!g_source_is_destroyed(&ctx->source)) {
g_source_remove_poll(&ctx->source, &old_node->pfd);
}
}
if (new_node) {
g_source_add_poll(&ctx->source, &new_node->pfd);
}
}
static void fdmon_poll_gsource_prepare(AioContext *ctx)
{
/* Do nothing */
}
static bool fdmon_poll_gsource_check(AioContext *ctx)
{
AioHandler *node;
bool result = false;
/*
* We have to walk very carefully in case aio_set_fd_handler is
* called while we're walking.
*/
qemu_lockcnt_inc(&ctx->list_lock);
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
int revents = node->pfd.revents & node->pfd.events;
if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
result = true;
break;
}
if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
result = true;
break;
}
}
qemu_lockcnt_dec(&ctx->list_lock);
return result;
}
static void fdmon_poll_gsource_dispatch(AioContext *ctx,
AioHandlerList *ready_list)
{
AioHandler *node;
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
int revents = node->pfd.revents;
if (revents) {
aio_add_ready_handler(ready_list, node, revents);
}
}
}
const FDMonOps fdmon_poll_ops = {
.update = fdmon_poll_update,
.wait = fdmon_poll_wait,
.need_wait = aio_poll_disabled,
.gsource_prepare = fdmon_poll_gsource_prepare,
.gsource_check = fdmon_poll_gsource_check,
.gsource_dispatch = fdmon_poll_gsource_dispatch,
};
void fdmon_poll_downgrade(AioContext *ctx)
{
AioHandler *node;
ctx->fdmon_ops = &fdmon_poll_ops;
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events) {
g_source_add_poll(&ctx->source, &node->pfd);
}
}
}


@@ -24,6 +24,10 @@ buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes
buffer_move(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
buffer_free(const char *buf, size_t len) "%s: capacity %zd"
# fdmon-io_uring.c
fdmon_io_uring_add_sqe(void *ctx, void *opaque, int opcode, int fd, uint64_t off, void *cqe_handler) "ctx %p opaque %p opcode %d fd %d off %"PRId64" cqe_handler %p"
fdmon_io_uring_cqe_handler(void *ctx, void *cqe_handler, int cqe_res) "ctx %p cqe_handler %p cqe_res %d"
# filemonitor-inotify.c
qemu_file_monitor_add_watch(void *mon, const char *dirpath, const char *filename, void *cb, void *opaque, int64_t id) "File monitor %p add watch dir='%s' file='%s' cb=%p opaque=%p id=%" PRId64
qemu_file_monitor_remove_watch(void *mon, const char *dirpath, int64_t id) "File monitor %p remove watch dir='%s' id=%" PRId64