From 8eeaa706ba73251063cb80d87ae838d2d5b08e9a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 19 Nov 2025 18:27:20 +0100 Subject: [PATCH 1/4] block-backend: Fix race when resuming queued requests When new requests arrive at a BlockBackend that is currently drained, these requests are queued until the drain section ends. There is a race window between blk_root_drained_end() waking up a queued request in an iothread from the main thread and blk_wait_while_drained() actually being woken up in the iothread and calling blk_inc_in_flight(). If the BlockBackend is drained again during this window, drain won't wait for this request and it will sneak in when the BlockBackend is already supposed to be quiesced. This causes assertion failures in bdrv_drain_all_begin() and can have other unintended consequences. Fix this by increasing the in_flight counter immediately when scheduling the request to be resumed so that the next drain will wait for it to complete. Cc: qemu-stable@nongnu.org Reported-by: Andrey Drobyshev Signed-off-by: Kevin Wolf Message-ID: <20251119172720.135424-1-kwolf@redhat.com> Reviewed-by: Hanna Czenczek Tested-by: Andrey Drobyshev Reviewed-by: Fiona Ebner Signed-off-by: Kevin Wolf --- block/block-backend.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index f8d6ba65c1..d6df369188 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1318,9 +1318,9 @@ static void coroutine_fn blk_wait_while_drained(BlockBackend *blk) * section. */ qemu_mutex_lock(&blk->queued_requests_lock); + /* blk_root_drained_end() has the corresponding blk_inc_in_flight() */ blk_dec_in_flight(blk); qemu_co_queue_wait(&blk->queued_requests, &blk->queued_requests_lock); - blk_inc_in_flight(blk); qemu_mutex_unlock(&blk->queued_requests_lock); } } @@ -2767,9 +2767,11 @@ static void blk_root_drained_end(BdrvChild *child) blk->dev_ops->drained_end(blk->dev_opaque); } qemu_mutex_lock(&blk->queued_requests_lock); - while (qemu_co_enter_next(&blk->queued_requests, - &blk->queued_requests_lock)) { + while (!qemu_co_queue_empty(&blk->queued_requests)) { /* Resume all queued requests */ + blk_inc_in_flight(blk); + qemu_co_enter_next(&blk->queued_requests, + &blk->queued_requests_lock); } qemu_mutex_unlock(&blk->queued_requests_lock); } From 98e788b91ad037193b1fb375561ef7e0fef3c2fd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 7 Oct 2025 10:16:58 -0400 Subject: [PATCH 2/4] file-posix: populate pwrite_zeroes_alignment Linux block devices require write zeroes alignment whereas files do not. It may come as a surprise that block devices opened in buffered I/O mode require the alignment for write zeroes requests although normal read/write requests do not. Therefore it is necessary to populate the pwrite_zeroes_alignment field. Cc: qemu-stable@nongnu.org Signed-off-by: Stefan Hajnoczi Message-ID: <20251007141700.71891-2-stefanha@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy Tested-by: Fiona Ebner Reviewed-by: Fiona Ebner Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/file-posix.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/block/file-posix.c b/block/file-posix.c index 12d12970fa..c9e367a222 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1611,6 +1611,22 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.pdiscard_alignment = dalign; } + +#ifdef __linux__ + /* + * Linux requires logical block size alignment for write zeroes even + * when normal reads/writes do not require alignment. + */ + if (!s->needs_alignment) { + ret = probe_logical_blocksize(s->fd, + &bs->bl.pwrite_zeroes_alignment); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to probe logical block size"); + return; + } + } +#endif /* __linux__ */ } raw_refresh_zoned_limits(bs, &st, errp); From d704a13d2c025779bc91d04e127427347ddcf3b3 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 7 Oct 2025 10:16:59 -0400 Subject: [PATCH 3/4] block: use pwrite_zeroes_alignment when writing first sector Since commit 5634622bcb33 ("file-posix: allow BLKZEROOUT with -t writeback"), qemu-img create errors out on a Linux loop block device with a 4 KB sector size: # dd if=/dev/zero of=blockfile bs=1M count=1024 # losetup --sector-size 4096 /dev/loop0 blockfile # qemu-img create -f raw /dev/loop0 1G Formatting '/dev/loop0', fmt=raw size=1073741824 qemu-img: /dev/loop0: Failed to clear the new image's first sector: Invalid argument Use the pwrite_zeroes_alignment block limit to avoid misaligned fallocate(2) or ioctl(BLKZEROOUT) in the block/file-posix.c block driver. Cc: qemu-stable@nongnu.org Fixes: 5634622bcb33 ("file-posix: allow BLKZEROOUT with -t writeback") Reported-by: Jean-Louis Dupond Buglink: https://gitlab.com/qemu-project/qemu/-/issues/3127 Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Stefan Hajnoczi Message-ID: <20251007141700.71891-3-stefanha@redhat.com> Tested-by: Fiona Ebner Reviewed-by: Fiona Ebner Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block.c | 3 ++- block/block-backend.c | 11 +++++++++++ include/system/block-backend-io.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 4f1581cedf..48a17f393c 100644 --- a/block.c +++ b/block.c @@ -606,12 +606,13 @@ create_file_fallback_zero_first_sector(BlockBackend *blk, int64_t current_size, Error **errp) { + uint32_t alignment = blk_get_pwrite_zeroes_alignment(blk); int64_t bytes_to_clear; int ret; GLOBAL_STATE_CODE(); - bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE); + bytes_to_clear = MIN(current_size, MAX(BDRV_SECTOR_SIZE, alignment)); if (bytes_to_clear) { ret = blk_co_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP); if (ret < 0) { diff --git a/block/block-backend.c b/block/block-backend.c index d6df369188..98315d4470 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2305,6 +2305,17 @@ uint32_t blk_get_request_alignment(BlockBackend *blk) return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE; } +/* Returns the optimal write zeroes alignment, in bytes; guaranteed nonzero */ +uint32_t blk_get_pwrite_zeroes_alignment(BlockBackend *blk) +{ + BlockDriverState *bs = blk_bs(blk); + IO_CODE(); + if (!bs) { + return BDRV_SECTOR_SIZE; + } + return bs->bl.pwrite_zeroes_alignment ?: bs->bl.request_alignment; +} + /* Returns the maximum hardware transfer length, in bytes; guaranteed nonzero */ uint64_t blk_get_max_hw_transfer(BlockBackend *blk) { diff --git a/include/system/block-backend-io.h b/include/system/block-backend-io.h index ba8dfcc7d0..6d5ac476fc 100644 --- a/include/system/block-backend-io.h +++ b/include/system/block-backend-io.h @@ -116,6 +116,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, void *opaque, int ret); uint32_t blk_get_request_alignment(BlockBackend *blk); +uint32_t blk_get_pwrite_zeroes_alignment(BlockBackend *blk); uint32_t blk_get_max_transfer(BlockBackend *blk); uint64_t blk_get_max_hw_transfer(BlockBackend *blk); From 59a1cf0cd31597d2f6e2c18dc400a1de8427d47d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 7 Oct 2025 10:17:00 -0400 Subject: [PATCH 4/4] iotests: add Linux loop device image creation test This qemu-iotests test case is based on the reproducer that Jean-Louis Dupond shared in https://gitlab.com/qemu-project/qemu/-/issues/3127. Signed-off-by: Stefan Hajnoczi Message-ID: <20251007141700.71891-4-stefanha@redhat.com> Reviewed-by: Vladimir Sementsov-Ogievskiy Tested-by: Vladimir Sementsov-Ogievskiy Tested-by: Fiona Ebner Reviewed-by: Fiona Ebner Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- tests/qemu-iotests/tests/loop-create-file | 59 +++++++++++++++++++ tests/qemu-iotests/tests/loop-create-file.out | 8 +++ 2 files changed, 67 insertions(+) create mode 100755 tests/qemu-iotests/tests/loop-create-file create mode 100644 tests/qemu-iotests/tests/loop-create-file.out diff --git a/tests/qemu-iotests/tests/loop-create-file b/tests/qemu-iotests/tests/loop-create-file new file mode 100755 index 0000000000..5ec75b046b --- /dev/null +++ b/tests/qemu-iotests/tests/loop-create-file @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# group: quick +# +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Copyright Red Hat, Inc. +# +# Test Linux loop device image creation +# +# This test verifies #3127 "qemu-img create fails on loop device with sector size 4096" +# https://gitlab.com/qemu-project/qemu/-/issues/3127 + +seq="$(basename $0)" +echo "QA output created by $seq" + +status=1 # failure is the default! + +_cleanup() { + if [ -n "$loopdev" ]; then + sudo losetup --detach "$loopdev" + fi + + _cleanup_test_img +} + +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +cd .. +. ./common.rc +. ./common.filter + +_supported_fmt raw +_supported_proto file +_supported_os Linux + +if ! sudo -n losetup &>/dev/null; then + _notrun "sudo losetup not available" +fi + +echo +echo "=== Create image on a 4 KB sector size loop device ===" +echo + +_make_test_img -f $IMGFMT 1M + +loopdev=$(sudo losetup --sector-size 4096 --find --show "$TEST_IMG") +if [ -z "$loopdev" ]; then + _fail +fi + +sudo $QEMU_IMG_PROG create -f raw "$loopdev" 1M | \ + sed -e "s#/dev/loop[0-9]\\+#LOOPDEV#g" + +# success, all done +echo +echo '*** done' +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/tests/loop-create-file.out b/tests/qemu-iotests/tests/loop-create-file.out new file mode 100644 index 0000000000..32d4155695 --- /dev/null +++ b/tests/qemu-iotests/tests/loop-create-file.out @@ -0,0 +1,8 @@ +QA output created by loop-create-file + +=== Create image on a 4 KB sector size loop device === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 +Formatting 'LOOPDEV', fmt=raw size=1048576 + +*** done