block/io_uring: use aio_add_sqe()
AioContext has its own io_uring instance for file descriptor monitoring. The disk I/O io_uring code was developed separately. Originally I thought the characteristics of file descriptor monitoring and disk I/O were too different, requiring separate io_uring instances.

Now it has become clear to me that it's feasible to share a single io_uring instance for file descriptor monitoring and disk I/O. We're not using io_uring's IOPOLL feature or anything else that would require a separate instance.

Unify block/io_uring.c and util/fdmon-io_uring.c using the new aio_add_sqe() API that allows user-defined io_uring sqe submission. Now block/io_uring.c just needs to submit readv/writev/fsync and most of the io_uring-specific logic is handled by fdmon-io_uring.c.

There are two immediate advantages:
1. Fewer system calls. There is no need to monitor the disk I/O io_uring ring fd from the file descriptor monitoring io_uring instance. Disk I/O completions are now picked up directly. Also, sqes are accumulated in the sq ring until the end of the event loop iteration and there are fewer io_uring_enter(2) syscalls.
2. Less code duplication.

Note that error_setg() messages are not supposed to end with punctuation, so I removed a '.' for the non-io_uring build error message.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20251104022933.618123-15-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
This commit is contained in:
parent
1eebdab3c3
commit
047dabef97
8 changed files with 130 additions and 493 deletions
|
|
@ -755,14 +755,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
|
||||||
}
|
}
|
||||||
#endif /* !defined(CONFIG_LINUX_AIO) */
|
#endif /* !defined(CONFIG_LINUX_AIO) */
|
||||||
|
|
||||||
#ifndef CONFIG_LINUX_IO_URING
|
|
||||||
if (s->use_linux_io_uring) {
|
if (s->use_linux_io_uring) {
|
||||||
error_setg(errp, "aio=io_uring was specified, but is not supported "
|
#ifdef CONFIG_LINUX_IO_URING
|
||||||
"in this build.");
|
if (!aio_has_io_uring()) {
|
||||||
|
error_setg(errp, "aio=io_uring was specified, but is not "
|
||||||
|
"available (disabled via io_uring_disabled "
|
||||||
|
"sysctl or blocked by container runtime "
|
||||||
|
"seccomp policy?)");
|
||||||
ret = -EINVAL;
|
ret = -EINVAL;
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
error_setg(errp, "aio=io_uring was specified, but is not supported "
|
||||||
|
"in this build");
|
||||||
|
ret = -EINVAL;
|
||||||
|
goto fail;
|
||||||
#endif /* !defined(CONFIG_LINUX_IO_URING) */
|
#endif /* !defined(CONFIG_LINUX_IO_URING) */
|
||||||
|
}
|
||||||
|
|
||||||
s->has_discard = true;
|
s->has_discard = true;
|
||||||
s->has_write_zeroes = true;
|
s->has_write_zeroes = true;
|
||||||
|
|
@ -2522,27 +2531,6 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_LINUX_IO_URING
|
|
||||||
static inline bool raw_check_linux_io_uring(BDRVRawState *s)
|
|
||||||
{
|
|
||||||
Error *local_err = NULL;
|
|
||||||
AioContext *ctx;
|
|
||||||
|
|
||||||
if (!s->use_linux_io_uring) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
ctx = qemu_get_current_aio_context();
|
|
||||||
if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) {
|
|
||||||
error_reportf_err(local_err, "Unable to use linux io_uring, "
|
|
||||||
"falling back to thread pool: ");
|
|
||||||
s->use_linux_io_uring = false;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef CONFIG_LINUX_AIO
|
#ifdef CONFIG_LINUX_AIO
|
||||||
static inline bool raw_check_linux_aio(BDRVRawState *s)
|
static inline bool raw_check_linux_aio(BDRVRawState *s)
|
||||||
{
|
{
|
||||||
|
|
@ -2595,7 +2583,7 @@ raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, uint64_t bytes,
|
||||||
if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
|
if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
|
||||||
type |= QEMU_AIO_MISALIGNED;
|
type |= QEMU_AIO_MISALIGNED;
|
||||||
#ifdef CONFIG_LINUX_IO_URING
|
#ifdef CONFIG_LINUX_IO_URING
|
||||||
} else if (raw_check_linux_io_uring(s)) {
|
} else if (s->use_linux_io_uring) {
|
||||||
assert(qiov->size == bytes);
|
assert(qiov->size == bytes);
|
||||||
ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
|
ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags);
|
||||||
goto out;
|
goto out;
|
||||||
|
|
@ -2692,7 +2680,7 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef CONFIG_LINUX_IO_URING
|
#ifdef CONFIG_LINUX_IO_URING
|
||||||
if (raw_check_linux_io_uring(s)) {
|
if (s->use_linux_io_uring) {
|
||||||
return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
|
return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
461
block/io_uring.c
461
block/io_uring.c
|
|
@ -11,28 +11,20 @@
|
||||||
#include "qemu/osdep.h"
|
#include "qemu/osdep.h"
|
||||||
#include <liburing.h>
|
#include <liburing.h>
|
||||||
#include "block/aio.h"
|
#include "block/aio.h"
|
||||||
#include "qemu/queue.h"
|
|
||||||
#include "block/block.h"
|
#include "block/block.h"
|
||||||
#include "block/raw-aio.h"
|
#include "block/raw-aio.h"
|
||||||
#include "qemu/coroutine.h"
|
#include "qemu/coroutine.h"
|
||||||
#include "qemu/defer-call.h"
|
|
||||||
#include "qapi/error.h"
|
|
||||||
#include "system/block-backend.h"
|
#include "system/block-backend.h"
|
||||||
#include "trace.h"
|
#include "trace.h"
|
||||||
|
|
||||||
/* Only used for assertions. */
|
typedef struct {
|
||||||
#include "qemu/coroutine_int.h"
|
|
||||||
|
|
||||||
/* io_uring ring size */
|
|
||||||
#define MAX_ENTRIES 128
|
|
||||||
|
|
||||||
typedef struct LuringAIOCB {
|
|
||||||
Coroutine *co;
|
Coroutine *co;
|
||||||
struct io_uring_sqe sqeq;
|
|
||||||
ssize_t ret;
|
|
||||||
QEMUIOVector *qiov;
|
QEMUIOVector *qiov;
|
||||||
bool is_read;
|
uint64_t offset;
|
||||||
QSIMPLEQ_ENTRY(LuringAIOCB) next;
|
ssize_t ret;
|
||||||
|
int type;
|
||||||
|
int fd;
|
||||||
|
BdrvRequestFlags flags;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Buffered reads may require resubmission, see
|
* Buffered reads may require resubmission, see
|
||||||
|
|
@ -40,36 +32,51 @@ typedef struct LuringAIOCB {
|
||||||
*/
|
*/
|
||||||
int total_read;
|
int total_read;
|
||||||
QEMUIOVector resubmit_qiov;
|
QEMUIOVector resubmit_qiov;
|
||||||
} LuringAIOCB;
|
|
||||||
|
|
||||||
typedef struct LuringQueue {
|
CqeHandler cqe_handler;
|
||||||
unsigned int in_queue;
|
} LuringRequest;
|
||||||
unsigned int in_flight;
|
|
||||||
bool blocked;
|
|
||||||
QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue;
|
|
||||||
} LuringQueue;
|
|
||||||
|
|
||||||
struct LuringState {
|
static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
|
||||||
AioContext *aio_context;
|
|
||||||
|
|
||||||
struct io_uring ring;
|
|
||||||
|
|
||||||
/* No locking required, only accessed from AioContext home thread */
|
|
||||||
LuringQueue io_q;
|
|
||||||
|
|
||||||
QEMUBH *completion_bh;
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* luring_resubmit:
|
|
||||||
*
|
|
||||||
* Resubmit a request by appending it to submit_queue. The caller must ensure
|
|
||||||
* that ioq_submit() is called later so that submit_queue requests are started.
|
|
||||||
*/
|
|
||||||
static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
|
|
||||||
{
|
{
|
||||||
QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
|
LuringRequest *req = opaque;
|
||||||
s->io_q.in_queue++;
|
QEMUIOVector *qiov = req->qiov;
|
||||||
|
uint64_t offset = req->offset;
|
||||||
|
int fd = req->fd;
|
||||||
|
BdrvRequestFlags flags = req->flags;
|
||||||
|
|
||||||
|
switch (req->type) {
|
||||||
|
case QEMU_AIO_WRITE:
|
||||||
|
#ifdef HAVE_IO_URING_PREP_WRITEV2
|
||||||
|
{
|
||||||
|
int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
|
||||||
|
io_uring_prep_writev2(sqe, fd, qiov->iov,
|
||||||
|
qiov->niov, offset, luring_flags);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
assert(flags == 0);
|
||||||
|
io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
case QEMU_AIO_ZONE_APPEND:
|
||||||
|
io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
|
||||||
|
break;
|
||||||
|
case QEMU_AIO_READ:
|
||||||
|
{
|
||||||
|
if (req->resubmit_qiov.iov != NULL) {
|
||||||
|
qiov = &req->resubmit_qiov;
|
||||||
|
}
|
||||||
|
io_uring_prep_readv(sqe, fd, qiov->iov, qiov->niov,
|
||||||
|
offset + req->total_read);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case QEMU_AIO_FLUSH:
|
||||||
|
io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
|
||||||
|
__func__, req->type);
|
||||||
|
abort();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -78,92 +85,35 @@ static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb)
|
||||||
* Short reads are rare but may occur. The remaining read request needs to be
|
* Short reads are rare but may occur. The remaining read request needs to be
|
||||||
* resubmitted.
|
* resubmitted.
|
||||||
*/
|
*/
|
||||||
static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
|
static void luring_resubmit_short_read(LuringRequest *req, int nread)
|
||||||
int nread)
|
|
||||||
{
|
{
|
||||||
QEMUIOVector *resubmit_qiov;
|
QEMUIOVector *resubmit_qiov;
|
||||||
size_t remaining;
|
size_t remaining;
|
||||||
|
|
||||||
trace_luring_resubmit_short_read(s, luringcb, nread);
|
trace_luring_resubmit_short_read(req, nread);
|
||||||
|
|
||||||
/* Update read position */
|
/* Update read position */
|
||||||
luringcb->total_read += nread;
|
req->total_read += nread;
|
||||||
remaining = luringcb->qiov->size - luringcb->total_read;
|
remaining = req->qiov->size - req->total_read;
|
||||||
|
|
||||||
/* Shorten qiov */
|
/* Shorten qiov */
|
||||||
resubmit_qiov = &luringcb->resubmit_qiov;
|
resubmit_qiov = &req->resubmit_qiov;
|
||||||
if (resubmit_qiov->iov == NULL) {
|
if (resubmit_qiov->iov == NULL) {
|
||||||
qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov);
|
qemu_iovec_init(resubmit_qiov, req->qiov->niov);
|
||||||
} else {
|
} else {
|
||||||
qemu_iovec_reset(resubmit_qiov);
|
qemu_iovec_reset(resubmit_qiov);
|
||||||
}
|
}
|
||||||
qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read,
|
qemu_iovec_concat(resubmit_qiov, req->qiov, req->total_read, remaining);
|
||||||
remaining);
|
|
||||||
|
|
||||||
/* Update sqe */
|
aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
|
||||||
luringcb->sqeq.off += nread;
|
|
||||||
luringcb->sqeq.addr = (uintptr_t)luringcb->resubmit_qiov.iov;
|
|
||||||
luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
|
|
||||||
|
|
||||||
luring_resubmit(s, luringcb);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
static void luring_cqe_handler(CqeHandler *cqe_handler)
|
||||||
* luring_process_completions:
|
|
||||||
* @s: AIO state
|
|
||||||
*
|
|
||||||
* Fetches completed I/O requests, consumes cqes and invokes their callbacks
|
|
||||||
* The function is somewhat tricky because it supports nested event loops, for
|
|
||||||
* example when a request callback invokes aio_poll().
|
|
||||||
*
|
|
||||||
* Function schedules BH completion so it can be called again in a nested
|
|
||||||
* event loop. When there are no events left to complete the BH is being
|
|
||||||
* canceled.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
static void luring_process_completions(LuringState *s)
|
|
||||||
{
|
{
|
||||||
struct io_uring_cqe *cqes;
|
LuringRequest *req = container_of(cqe_handler, LuringRequest, cqe_handler);
|
||||||
int total_bytes;
|
int ret = cqe_handler->cqe.res;
|
||||||
|
|
||||||
defer_call_begin();
|
trace_luring_cqe_handler(req, ret);
|
||||||
|
|
||||||
/*
|
|
||||||
* Request completion callbacks can run the nested event loop.
|
|
||||||
* Schedule ourselves so the nested event loop will "see" remaining
|
|
||||||
* completed requests and process them. Without this, completion
|
|
||||||
* callbacks that wait for other requests using a nested event loop
|
|
||||||
* would hang forever.
|
|
||||||
*
|
|
||||||
* This workaround is needed because io_uring uses poll_wait, which
|
|
||||||
* is woken up when new events are added to the uring, thus polling on
|
|
||||||
* the same uring fd will block unless more events are received.
|
|
||||||
*
|
|
||||||
* Other leaf block drivers (drivers that access the data themselves)
|
|
||||||
* are networking based, so they poll sockets for data and run the
|
|
||||||
* correct coroutine.
|
|
||||||
*/
|
|
||||||
qemu_bh_schedule(s->completion_bh);
|
|
||||||
|
|
||||||
while (io_uring_peek_cqe(&s->ring, &cqes) == 0) {
|
|
||||||
LuringAIOCB *luringcb;
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
if (!cqes) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
luringcb = io_uring_cqe_get_data(cqes);
|
|
||||||
ret = cqes->res;
|
|
||||||
io_uring_cqe_seen(&s->ring, cqes);
|
|
||||||
cqes = NULL;
|
|
||||||
|
|
||||||
/* Change counters one-by-one because we can be nested. */
|
|
||||||
s->io_q.in_flight--;
|
|
||||||
trace_luring_process_completion(s, luringcb, ret);
|
|
||||||
|
|
||||||
/* total_read is non-zero only for resubmitted read requests */
|
|
||||||
total_bytes = ret + luringcb->total_read;
|
|
||||||
|
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
/*
|
/*
|
||||||
|
|
@ -181,282 +131,69 @@ static void luring_process_completions(LuringState *s)
|
||||||
* immediately.
|
* immediately.
|
||||||
*/
|
*/
|
||||||
if (ret == -EINTR || ret == -EAGAIN) {
|
if (ret == -EINTR || ret == -EAGAIN) {
|
||||||
luring_resubmit(s, luringcb);
|
aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
|
||||||
continue;
|
return;
|
||||||
}
|
}
|
||||||
} else if (!luringcb->qiov) {
|
} else if (req->qiov) {
|
||||||
goto end;
|
/* total_read is non-zero only for resubmitted read requests */
|
||||||
} else if (total_bytes == luringcb->qiov->size) {
|
int total_bytes = ret + req->total_read;
|
||||||
|
|
||||||
|
if (total_bytes == req->qiov->size) {
|
||||||
ret = 0;
|
ret = 0;
|
||||||
/* Only read/write */
|
|
||||||
} else {
|
} else {
|
||||||
/* Short Read/Write */
|
/* Short Read/Write */
|
||||||
if (luringcb->is_read) {
|
if (req->type == QEMU_AIO_READ) {
|
||||||
if (ret > 0) {
|
if (ret > 0) {
|
||||||
luring_resubmit_short_read(s, luringcb, ret);
|
luring_resubmit_short_read(req, ret);
|
||||||
continue;
|
return;
|
||||||
} else {
|
|
||||||
/* Pad with zeroes */
|
|
||||||
qemu_iovec_memset(luringcb->qiov, total_bytes, 0,
|
|
||||||
luringcb->qiov->size - total_bytes);
|
|
||||||
ret = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Pad with zeroes */
|
||||||
|
qemu_iovec_memset(req->qiov, total_bytes, 0,
|
||||||
|
req->qiov->size - total_bytes);
|
||||||
|
ret = 0;
|
||||||
} else {
|
} else {
|
||||||
ret = -ENOSPC;
|
ret = -ENOSPC;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
end:
|
}
|
||||||
luringcb->ret = ret;
|
|
||||||
qemu_iovec_destroy(&luringcb->resubmit_qiov);
|
req->ret = ret;
|
||||||
|
qemu_iovec_destroy(&req->resubmit_qiov);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the coroutine is already entered it must be in ioq_submit()
|
* If the coroutine is already entered it must be in luring_co_submit() and
|
||||||
* and will notice luringcb->ret has been filled in when it
|
* will notice req->ret has been filled in when it eventually runs later.
|
||||||
* eventually runs later. Coroutines cannot be entered recursively
|
* Coroutines cannot be entered recursively so avoid doing that!
|
||||||
* so avoid doing that!
|
|
||||||
*/
|
*/
|
||||||
assert(luringcb->co->ctx == s->aio_context);
|
if (!qemu_coroutine_entered(req->co)) {
|
||||||
if (!qemu_coroutine_entered(luringcb->co)) {
|
aio_co_wake(req->co);
|
||||||
aio_co_wake(luringcb->co);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
qemu_bh_cancel(s->completion_bh);
|
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd,
|
||||||
|
uint64_t offset, QEMUIOVector *qiov,
|
||||||
defer_call_end();
|
int type, BdrvRequestFlags flags)
|
||||||
}
|
|
||||||
|
|
||||||
static int ioq_submit(LuringState *s)
|
|
||||||
{
|
{
|
||||||
int ret = 0;
|
LuringRequest req = {
|
||||||
LuringAIOCB *luringcb, *luringcb_next;
|
|
||||||
|
|
||||||
while (s->io_q.in_queue > 0) {
|
|
||||||
/*
|
|
||||||
* Try to fetch sqes from the ring for requests waiting in
|
|
||||||
* the overflow queue
|
|
||||||
*/
|
|
||||||
QSIMPLEQ_FOREACH_SAFE(luringcb, &s->io_q.submit_queue, next,
|
|
||||||
luringcb_next) {
|
|
||||||
struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring);
|
|
||||||
if (!sqes) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
/* Prep sqe for submission */
|
|
||||||
*sqes = luringcb->sqeq;
|
|
||||||
QSIMPLEQ_REMOVE_HEAD(&s->io_q.submit_queue, next);
|
|
||||||
}
|
|
||||||
ret = io_uring_submit(&s->ring);
|
|
||||||
trace_luring_io_uring_submit(s, ret);
|
|
||||||
/* Prevent infinite loop if submission is refused */
|
|
||||||
if (ret <= 0) {
|
|
||||||
if (ret == -EAGAIN || ret == -EINTR) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
s->io_q.in_flight += ret;
|
|
||||||
s->io_q.in_queue -= ret;
|
|
||||||
}
|
|
||||||
s->io_q.blocked = (s->io_q.in_queue > 0);
|
|
||||||
|
|
||||||
if (s->io_q.in_flight) {
|
|
||||||
/*
|
|
||||||
* We can try to complete something just right away if there are
|
|
||||||
* still requests in-flight.
|
|
||||||
*/
|
|
||||||
luring_process_completions(s);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void luring_process_completions_and_submit(LuringState *s)
|
|
||||||
{
|
|
||||||
luring_process_completions(s);
|
|
||||||
|
|
||||||
if (s->io_q.in_queue > 0) {
|
|
||||||
ioq_submit(s);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void qemu_luring_completion_bh(void *opaque)
|
|
||||||
{
|
|
||||||
LuringState *s = opaque;
|
|
||||||
luring_process_completions_and_submit(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void qemu_luring_completion_cb(void *opaque)
|
|
||||||
{
|
|
||||||
LuringState *s = opaque;
|
|
||||||
luring_process_completions_and_submit(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool qemu_luring_poll_cb(void *opaque)
|
|
||||||
{
|
|
||||||
LuringState *s = opaque;
|
|
||||||
|
|
||||||
return io_uring_cq_ready(&s->ring);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void qemu_luring_poll_ready(void *opaque)
|
|
||||||
{
|
|
||||||
LuringState *s = opaque;
|
|
||||||
|
|
||||||
luring_process_completions_and_submit(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ioq_init(LuringQueue *io_q)
|
|
||||||
{
|
|
||||||
QSIMPLEQ_INIT(&io_q->submit_queue);
|
|
||||||
io_q->in_queue = 0;
|
|
||||||
io_q->in_flight = 0;
|
|
||||||
io_q->blocked = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void luring_deferred_fn(void *opaque)
|
|
||||||
{
|
|
||||||
LuringState *s = opaque;
|
|
||||||
trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
|
|
||||||
s->io_q.in_flight);
|
|
||||||
if (!s->io_q.blocked && s->io_q.in_queue > 0) {
|
|
||||||
ioq_submit(s);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* luring_do_submit:
|
|
||||||
* @fd: file descriptor for I/O
|
|
||||||
* @luringcb: AIO control block
|
|
||||||
* @s: AIO state
|
|
||||||
* @offset: offset for request
|
|
||||||
* @type: type of request
|
|
||||||
*
|
|
||||||
* Fetches sqes from ring, adds to pending queue and preps them
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
|
|
||||||
uint64_t offset, int type, BdrvRequestFlags flags)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
struct io_uring_sqe *sqes = &luringcb->sqeq;
|
|
||||||
|
|
||||||
switch (type) {
|
|
||||||
case QEMU_AIO_WRITE:
|
|
||||||
#ifdef HAVE_IO_URING_PREP_WRITEV2
|
|
||||||
{
|
|
||||||
int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
|
|
||||||
io_uring_prep_writev2(sqes, fd, luringcb->qiov->iov,
|
|
||||||
luringcb->qiov->niov, offset, luring_flags);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
assert(flags == 0);
|
|
||||||
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
|
|
||||||
luringcb->qiov->niov, offset);
|
|
||||||
#endif
|
|
||||||
break;
|
|
||||||
case QEMU_AIO_ZONE_APPEND:
|
|
||||||
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
|
|
||||||
luringcb->qiov->niov, offset);
|
|
||||||
break;
|
|
||||||
case QEMU_AIO_READ:
|
|
||||||
io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
|
|
||||||
luringcb->qiov->niov, offset);
|
|
||||||
break;
|
|
||||||
case QEMU_AIO_FLUSH:
|
|
||||||
io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
|
|
||||||
__func__, type);
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
io_uring_sqe_set_data(sqes, luringcb);
|
|
||||||
|
|
||||||
QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
|
|
||||||
s->io_q.in_queue++;
|
|
||||||
trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
|
|
||||||
s->io_q.in_flight);
|
|
||||||
if (!s->io_q.blocked) {
|
|
||||||
if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
|
|
||||||
ret = ioq_submit(s);
|
|
||||||
trace_luring_do_submit_done(s, ret);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
defer_call(luring_deferred_fn, s);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
|
|
||||||
QEMUIOVector *qiov, int type,
|
|
||||||
BdrvRequestFlags flags)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
AioContext *ctx = qemu_get_current_aio_context();
|
|
||||||
LuringState *s = aio_get_linux_io_uring(ctx);
|
|
||||||
LuringAIOCB luringcb = {
|
|
||||||
.co = qemu_coroutine_self(),
|
.co = qemu_coroutine_self(),
|
||||||
.ret = -EINPROGRESS,
|
|
||||||
.qiov = qiov,
|
.qiov = qiov,
|
||||||
.is_read = (type == QEMU_AIO_READ),
|
.ret = -EINPROGRESS,
|
||||||
|
.type = type,
|
||||||
|
.fd = fd,
|
||||||
|
.offset = offset,
|
||||||
|
.flags = flags,
|
||||||
};
|
};
|
||||||
trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0,
|
|
||||||
type);
|
|
||||||
ret = luring_do_submit(fd, &luringcb, s, offset, type, flags);
|
|
||||||
|
|
||||||
if (ret < 0) {
|
req.cqe_handler.cb = luring_cqe_handler;
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (luringcb.ret == -EINPROGRESS) {
|
trace_luring_co_submit(bs, &req, fd, offset, qiov ? qiov->size : 0, type);
|
||||||
|
aio_add_sqe(luring_prep_sqe, &req, &req.cqe_handler);
|
||||||
|
|
||||||
|
if (req.ret == -EINPROGRESS) {
|
||||||
qemu_coroutine_yield();
|
qemu_coroutine_yield();
|
||||||
}
|
}
|
||||||
return luringcb.ret;
|
return req.ret;
|
||||||
}
|
|
||||||
|
|
||||||
void luring_detach_aio_context(LuringState *s, AioContext *old_context)
|
|
||||||
{
|
|
||||||
aio_set_fd_handler(old_context, s->ring.ring_fd,
|
|
||||||
NULL, NULL, NULL, NULL, s);
|
|
||||||
qemu_bh_delete(s->completion_bh);
|
|
||||||
s->aio_context = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void luring_attach_aio_context(LuringState *s, AioContext *new_context)
|
|
||||||
{
|
|
||||||
s->aio_context = new_context;
|
|
||||||
s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s);
|
|
||||||
aio_set_fd_handler(s->aio_context, s->ring.ring_fd,
|
|
||||||
qemu_luring_completion_cb, NULL,
|
|
||||||
qemu_luring_poll_cb, qemu_luring_poll_ready, s);
|
|
||||||
}
|
|
||||||
|
|
||||||
LuringState *luring_init(Error **errp)
|
|
||||||
{
|
|
||||||
int rc;
|
|
||||||
LuringState *s = g_new0(LuringState, 1);
|
|
||||||
struct io_uring *ring = &s->ring;
|
|
||||||
|
|
||||||
trace_luring_init_state(s, sizeof(*s));
|
|
||||||
|
|
||||||
rc = io_uring_queue_init(MAX_ENTRIES, ring, 0);
|
|
||||||
if (rc < 0) {
|
|
||||||
error_setg_errno(errp, -rc, "failed to init linux io_uring ring");
|
|
||||||
g_free(s);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
ioq_init(&s->io_q);
|
|
||||||
return s;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void luring_cleanup(LuringState *s)
|
|
||||||
{
|
|
||||||
io_uring_queue_exit(&s->ring);
|
|
||||||
trace_luring_cleanup_state(s);
|
|
||||||
g_free(s);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool luring_has_fua(void)
|
bool luring_has_fua(void)
|
||||||
|
|
|
||||||
|
|
@ -62,15 +62,9 @@ qmp_block_stream(void *bs) "bs %p"
|
||||||
file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
|
file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d"
|
||||||
|
|
||||||
# io_uring.c
|
# io_uring.c
|
||||||
luring_init_state(void *s, size_t size) "s %p size %zu"
|
luring_cqe_handler(void *req, int ret) "req %p ret %d"
|
||||||
luring_cleanup_state(void *s) "%p freed"
|
luring_co_submit(void *bs, void *req, int fd, uint64_t offset, size_t nbytes, int type) "bs %p req %p fd %d offset %" PRId64 " nbytes %zd type %d"
|
||||||
luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
|
luring_resubmit_short_read(void *req, int nread) "req %p nread %d"
|
||||||
luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d"
|
|
||||||
luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d"
|
|
||||||
luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d"
|
|
||||||
luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d"
|
|
||||||
luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d"
|
|
||||||
luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d"
|
|
||||||
|
|
||||||
# qcow2.c
|
# qcow2.c
|
||||||
qcow2_add_task(void *co, void *bs, void *pool, const char *action, int cluster_type, uint64_t host_offset, uint64_t offset, uint64_t bytes, void *qiov, size_t qiov_offset) "co %p bs %p pool %p: %s: cluster_type %d file_cluster_offset %" PRIu64 " offset %" PRIu64 " bytes %" PRIu64 " qiov %p qiov_offset %zu"
|
qcow2_add_task(void *co, void *bs, void *pool, const char *action, int cluster_type, uint64_t host_offset, uint64_t offset, uint64_t bytes, void *qiov, size_t qiov_offset) "co %p bs %p pool %p: %s: cluster_type %d file_cluster_offset %" PRIu64 " offset %" PRIu64 " bytes %" PRIu64 " qiov %p qiov_offset %zu"
|
||||||
|
|
|
||||||
|
|
@ -310,8 +310,6 @@ struct AioContext {
|
||||||
struct LinuxAioState *linux_aio;
|
struct LinuxAioState *linux_aio;
|
||||||
#endif
|
#endif
|
||||||
#ifdef CONFIG_LINUX_IO_URING
|
#ifdef CONFIG_LINUX_IO_URING
|
||||||
LuringState *linux_io_uring;
|
|
||||||
|
|
||||||
/* State for file descriptor monitoring using Linux io_uring */
|
/* State for file descriptor monitoring using Linux io_uring */
|
||||||
struct io_uring fdmon_io_uring;
|
struct io_uring fdmon_io_uring;
|
||||||
AioHandlerSList submit_list;
|
AioHandlerSList submit_list;
|
||||||
|
|
@ -615,11 +613,6 @@ struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
|
||||||
/* Return the LinuxAioState bound to this AioContext */
|
/* Return the LinuxAioState bound to this AioContext */
|
||||||
struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
|
struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
|
||||||
|
|
||||||
/* Setup the LuringState bound to this AioContext */
|
|
||||||
LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp);
|
|
||||||
|
|
||||||
/* Return the LuringState bound to this AioContext */
|
|
||||||
LuringState *aio_get_linux_io_uring(AioContext *ctx);
|
|
||||||
/**
|
/**
|
||||||
* aio_timer_new_with_attrs:
|
* aio_timer_new_with_attrs:
|
||||||
* @ctx: the aio context
|
* @ctx: the aio context
|
||||||
|
|
|
||||||
|
|
@ -74,15 +74,10 @@ static inline bool laio_has_fua(void)
|
||||||
#endif
|
#endif
|
||||||
/* io_uring.c - Linux io_uring implementation */
|
/* io_uring.c - Linux io_uring implementation */
|
||||||
#ifdef CONFIG_LINUX_IO_URING
|
#ifdef CONFIG_LINUX_IO_URING
|
||||||
LuringState *luring_init(Error **errp);
|
|
||||||
void luring_cleanup(LuringState *s);
|
|
||||||
|
|
||||||
/* luring_co_submit: submit I/O requests in the thread's current AioContext. */
|
/* luring_co_submit: submit I/O requests in the thread's current AioContext. */
|
||||||
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
|
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset,
|
||||||
QEMUIOVector *qiov, int type,
|
QEMUIOVector *qiov, int type,
|
||||||
BdrvRequestFlags flags);
|
BdrvRequestFlags flags);
|
||||||
void luring_detach_aio_context(LuringState *s, AioContext *old_context);
|
|
||||||
void luring_attach_aio_context(LuringState *s, AioContext *new_context);
|
|
||||||
bool luring_has_fua(void);
|
bool luring_has_fua(void);
|
||||||
#else
|
#else
|
||||||
static inline bool luring_has_fua(void)
|
static inline bool luring_has_fua(void)
|
||||||
|
|
|
||||||
|
|
@ -1,32 +0,0 @@
|
||||||
/*
|
|
||||||
* Linux io_uring support.
|
|
||||||
*
|
|
||||||
* Copyright (C) 2009 IBM, Corp.
|
|
||||||
* Copyright (C) 2009 Red Hat, Inc.
|
|
||||||
*
|
|
||||||
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
|
||||||
* See the COPYING file in the top-level directory.
|
|
||||||
*/
|
|
||||||
#include "qemu/osdep.h"
|
|
||||||
#include "block/aio.h"
|
|
||||||
#include "block/raw-aio.h"
|
|
||||||
|
|
||||||
void luring_detach_aio_context(LuringState *s, AioContext *old_context)
|
|
||||||
{
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
|
|
||||||
void luring_attach_aio_context(LuringState *s, AioContext *new_context)
|
|
||||||
{
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
|
|
||||||
LuringState *luring_init(Error **errp)
|
|
||||||
{
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
|
|
||||||
void luring_cleanup(LuringState *s)
|
|
||||||
{
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
|
|
@ -32,9 +32,6 @@ if have_block or have_ga
|
||||||
stub_ss.add(files('cpus-virtual-clock.c'))
|
stub_ss.add(files('cpus-virtual-clock.c'))
|
||||||
stub_ss.add(files('icount.c'))
|
stub_ss.add(files('icount.c'))
|
||||||
stub_ss.add(files('graph-lock.c'))
|
stub_ss.add(files('graph-lock.c'))
|
||||||
if linux_io_uring.found()
|
|
||||||
stub_ss.add(files('io_uring.c'))
|
|
||||||
endif
|
|
||||||
if libaio.found()
|
if libaio.found()
|
||||||
stub_ss.add(files('linux-aio.c'))
|
stub_ss.add(files('linux-aio.c'))
|
||||||
endif
|
endif
|
||||||
|
|
|
||||||
35
util/async.c
35
util/async.c
|
|
@ -386,14 +386,6 @@ aio_ctx_finalize(GSource *source)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef CONFIG_LINUX_IO_URING
|
|
||||||
if (ctx->linux_io_uring) {
|
|
||||||
luring_detach_aio_context(ctx->linux_io_uring, ctx);
|
|
||||||
luring_cleanup(ctx->linux_io_uring);
|
|
||||||
ctx->linux_io_uring = NULL;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
|
assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
|
||||||
qemu_bh_delete(ctx->co_schedule_bh);
|
qemu_bh_delete(ctx->co_schedule_bh);
|
||||||
|
|
||||||
|
|
@ -469,29 +461,6 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef CONFIG_LINUX_IO_URING
|
|
||||||
LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp)
|
|
||||||
{
|
|
||||||
if (ctx->linux_io_uring) {
|
|
||||||
return ctx->linux_io_uring;
|
|
||||||
}
|
|
||||||
|
|
||||||
ctx->linux_io_uring = luring_init(errp);
|
|
||||||
if (!ctx->linux_io_uring) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
luring_attach_aio_context(ctx->linux_io_uring, ctx);
|
|
||||||
return ctx->linux_io_uring;
|
|
||||||
}
|
|
||||||
|
|
||||||
LuringState *aio_get_linux_io_uring(AioContext *ctx)
|
|
||||||
{
|
|
||||||
assert(ctx->linux_io_uring);
|
|
||||||
return ctx->linux_io_uring;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void aio_notify(AioContext *ctx)
|
void aio_notify(AioContext *ctx)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
|
|
@ -631,10 +600,6 @@ AioContext *aio_context_new(Error **errp)
|
||||||
ctx->linux_aio = NULL;
|
ctx->linux_aio = NULL;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef CONFIG_LINUX_IO_URING
|
|
||||||
ctx->linux_io_uring = NULL;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ctx->thread_pool = NULL;
|
ctx->thread_pool = NULL;
|
||||||
qemu_rec_mutex_init(&ctx->lock);
|
qemu_rec_mutex_init(&ctx->lock);
|
||||||
timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
|
timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue