diff --git a/block/file-posix.c b/block/file-posix.c index c9e367a222..6265d2e248 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -4288,25 +4288,8 @@ hdev_open_Mac_error: static bool coroutine_fn sgio_path_error(int ret, sg_io_hdr_t *io_hdr) { if (ret < 0) { - switch (ret) { - case -ENODEV: - return true; - case -EAGAIN: - /* - * The device is probably suspended. This happens while the dm table - * is reloaded, e.g. because a path is added or removed. This is an - * operation that should complete within 1ms, so just wait a bit and - * retry. - * - * If the device was suspended for another reason, we'll wait and - * retry SG_IO_MAX_RETRIES times. This is a tolerable delay before - * we return an error and potentially stop the VM. - */ - qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000); - return true; - default: - return false; - } + /* Path errors sometimes result in -ENODEV */ + return ret == -ENODEV; } if (io_hdr->host_status != SCSI_HOST_OK) { @@ -4375,6 +4358,7 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) { BDRVRawState *s = bs->opaque; RawPosixAIOData acb; + uint64_t eagain_sleep_ns = 1 * SCALE_MS; int retries = SG_IO_MAX_RETRIES; int ret; @@ -4403,9 +4387,37 @@ hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) }, }; - do { - ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb); - } while (req == SG_IO && retries-- && hdev_co_ioctl_sgio_retry(&acb, ret)); +retry: + ret = raw_thread_pool_submit(handle_aiocb_ioctl, &acb); + if (req == SG_IO && s->use_mpath) { + if (ret == -EAGAIN && eagain_sleep_ns < NANOSECONDS_PER_SECOND) { + /* + * If this is a multipath device, it is probably suspended. + * + * This can happen while the dm table is reloaded, e.g. because a + * path is added or removed. This is an operation that should + * complete within 1ms, so just wait a bit and retry. + * + * There are also some cases in which libmpathpersist must recover + * from path failure during its operation, which can leave the + * device suspended for a bit longer while the library brings back + * reservations into the expected state. + * + * Use increasing delays to cover both cases without waiting + * excessively, and stop after a bit more than a second (1023 ms). + * This is a tolerable delay before we return an error and + * potentially stop the VM. + */ + qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, eagain_sleep_ns); + eagain_sleep_ns *= 2; + goto retry; + } + + /* Even for ret == 0, the SG_IO header can contain an error */ + if (retries-- && hdev_co_ioctl_sgio_retry(&acb, ret)) { + goto retry; + } + } return ret; }