From 0c1a109fe6d1bcad98f217dd9ce8767b07039df0 Mon Sep 17 00:00:00 2001 From: John Levon Date: Tue, 15 Jul 2025 13:59:51 +0200 Subject: [PATCH 1/7] =?UTF-8?q?hw/vfio-user:=20add=20C=C3=A9dric=20Le=20Go?= =?UTF-8?q?ater=20as=20a=20maintainer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: John Levon Acked-by: Mark Cave-Ayland Link: https://lore.kernel.org/qemu-devel/20250715115954.515819-2-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e88ed2c0a9..30e9b71e6e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4287,6 +4287,7 @@ F: tests/functional/test_multiprocess.py VFIO-USER: M: John Levon M: Thanos Makatos +M: Cédric Le Goater S: Supported F: docs/interop/vfio-user.rst F: docs/system/devices/vfio-user.rst From 09353802f0021af9f13ebe9336e1994da4505626 Mon Sep 17 00:00:00 2001 From: John Levon Date: Tue, 15 Jul 2025 13:59:52 +0200 Subject: [PATCH 2/7] hw/vfio: fix region fd initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We were not initializing the region fd array to -1, so we would accidentally try to close(0) on cleanup for any region that is not referenced. Fixes: 95cdb024 ("vfio: add region info cache") Signed-off-by: John Levon Reviewed-by: Mark Cave-Ayland Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250715115954.515819-3-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/device.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 96cf21462c..52a1996dc4 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -463,6 +463,8 @@ void vfio_device_detach(VFIODevice *vbasedev) void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, struct vfio_device_info *info) { + int i; + vbasedev->num_irqs = info->num_irqs; vbasedev->num_regions = info->num_regions; vbasedev->flags = info->flags; @@ -477,6 +479,9 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, vbasedev->num_regions); if (vbasedev->use_region_fds) { vbasedev->region_fds = g_new0(int, vbasedev->num_regions); + for (i = 0; i < vbasedev->num_regions; i++) { + vbasedev->region_fds[i] = -1; + } } } @@ -489,7 +494,6 @@ void vfio_device_unprepare(VFIODevice *vbasedev) if (vbasedev->region_fds != NULL && vbasedev->region_fds[i] != -1) { close(vbasedev->region_fds[i]); } - } g_clear_pointer(&vbasedev->reginfo, g_free); From ea6788440df37495de6e257ca204cdd669d32b83 Mon Sep 17 00:00:00 2001 From: John Levon Date: Tue, 15 Jul 2025 13:59:53 +0200 Subject: [PATCH 3/7] hw/vfio-user: wait for proxy close correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Coverity reported: CID 1611806: Concurrent data access violations (BAD_CHECK_OF_WAIT_COND) A wait is performed without a loop. If there is a spurious wakeup, the condition may not be satisfied. Fix this by checking ->state for VFIO_PROXY_CLOSED in a loop. Also rename the callback for clarity. Signed-off-by: John Levon Reviewed-by: Mark Cave-Ayland Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250715115954.515819-4-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio-user/proxy.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/vfio-user/proxy.c b/hw/vfio-user/proxy.c index c418954440..2275d3fe39 100644 --- a/hw/vfio-user/proxy.c +++ b/hw/vfio-user/proxy.c @@ -32,7 +32,6 @@ static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg); static void vfio_user_recv(void *opaque); static void vfio_user_send(void *opaque); -static void vfio_user_cb(void *opaque); static void vfio_user_request(void *opaque); @@ -492,7 +491,7 @@ static void vfio_user_send(void *opaque) } } -static void vfio_user_cb(void *opaque) +static void vfio_user_close_cb(void *opaque) { VFIOUserProxy *proxy = opaque; @@ -984,8 +983,11 @@ void vfio_user_disconnect(VFIOUserProxy *proxy) * handler to run after the proxy fd handlers were * deleted above. */ - aio_bh_schedule_oneshot(proxy->ctx, vfio_user_cb, proxy); - qemu_cond_wait(&proxy->close_cv, &proxy->lock); + aio_bh_schedule_oneshot(proxy->ctx, vfio_user_close_cb, proxy); + + while (proxy->state != VFIO_PROXY_CLOSED) { + qemu_cond_wait(&proxy->close_cv, &proxy->lock); + } /* we now hold the only ref to proxy */ qemu_mutex_unlock(&proxy->lock); From 622740aad9f39c4266ce00d7478b32c7506e6642 Mon Sep 17 00:00:00 2001 From: John Levon Date: Tue, 15 Jul 2025 13:59:54 +0200 Subject: [PATCH 4/7] hw/vfio-user: fix use of uninitialized variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Coverity reported: CID 1611805: Uninitialized variables in vfio_user_dma_map(). This can occur in the happy path when ->async_ops was not set; as this doesn't typically happen, it wasn't caught during testing. Align both map and unmap implementations to initialize ret the same way to resolve this. Resolves: Coverity CID 1611805 Fixes: 18e899e6 ("vfio-user: implement VFIO_USER_DMA_MAP/UNMAP") Reported-by: Cédric Le Goater Signed-off-by: John Levon Reviewed-by: Cédric Le Goater Reviewed-by: Mark Cave-Ayland Link: https://lore.kernel.org/qemu-devel/20250715115954.515819-5-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio-user/container.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/hw/vfio-user/container.c b/hw/vfio-user/container.c index d318e6a339..d589dd90f5 100644 --- a/hw/vfio-user/container.c +++ b/hw/vfio-user/container.c @@ -64,8 +64,6 @@ static int vfio_user_dma_unmap(const VFIOContainerBase *bcontainer, 0, &local_err)) { error_report_err(local_err); ret = -EFAULT; - } else { - ret = 0; } } else { if (!vfio_user_send_wait(container->proxy, &msgp->hdr, NULL, @@ -92,7 +90,7 @@ static int vfio_user_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, bcontainer); int fd = memory_region_get_fd(mrp); Error *local_err = NULL; - int ret; + int ret = 0; VFIOUserFDs *fds = NULL; VFIOUserDMAMap *msgp = g_malloc0(sizeof(*msgp)); @@ -135,8 +133,6 @@ static int vfio_user_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, 0, &local_err)) { error_report_err(local_err); ret = -EFAULT; - } else { - ret = 0; } } else { VFIOUserFDs local_fds = { 1, 0, &fd }; From a59d06305fff9d10ddeeaebc66590af422362701 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Tue, 8 Jul 2025 22:52:11 +0800 Subject: [PATCH 5/7] vfio/pci: Introduce x-pci-class-code option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce x-pci-class-code option to allow users to override PCI class code of a device, similar to the existing x-pci-vendor-id option. Only the lower 24 bits of this option are used, though a uint32 is used here for determining whether the value is valid and set by user. Additionally, to ensure VGA ranges are only exposed on VGA devices, pci_register_vga() is now called in vfio_pci_config_setup(), after the class code override is completed. This is mainly intended for IGD devices that expose themselves either as VGA controller (primary display) or Display controller (non-primary display). The UEFI GOP driver depends on the device reporting a VGA controller class code (0x030000). Signed-off-by: Tomita Moeko Reviewed-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250708145211.6179-1-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 29 +++++++++++++++++++++++++---- hw/vfio/pci.h | 6 ++---- hw/vfio/trace-events | 1 + 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 1093b28df7..910042c6c2 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2893,10 +2893,6 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) "vfio-vga-io@0x3c0", QEMU_PCI_VGA_IO_HI_SIZE); - pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, - &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, - &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); - return true; } @@ -3228,6 +3224,23 @@ bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) vdev->sub_device_id); } + /* + * Class code is a 24-bit value at config space 0x09. Allow overriding it + * with any 24-bit value. + */ + if (vdev->class_code != PCI_ANY_ID) { + if (vdev->class_code > 0xffffff) { + error_setg(errp, "invalid PCI class code provided"); + return false; + } + /* Higher 24 bits of PCI_CLASS_REVISION are class code */ + vfio_add_emulated_long(vdev, PCI_CLASS_REVISION, + vdev->class_code << 8, ~0xff); + trace_vfio_pci_emulated_class_code(vbasedev->name, vdev->class_code); + } else { + vdev->class_code = pci_get_long(pdev->config + PCI_CLASS_REVISION) >> 8; + } + /* QEMU can change multi-function devices to single function, or reverse */ vdev->emulated_config_bits[PCI_HEADER_TYPE] = PCI_HEADER_TYPE_MULTI_FUNCTION; @@ -3257,6 +3270,12 @@ bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) vfio_bars_register(vdev); + if (vdev->vga && vfio_is_vga(vdev)) { + pci_register_vga(&vdev->pdev, &vdev->vga->region[QEMU_PCI_VGA_MEM].mem, + &vdev->vga->region[QEMU_PCI_VGA_IO_LO].mem, + &vdev->vga->region[QEMU_PCI_VGA_IO_HI].mem); + } + return true; } @@ -3643,6 +3662,8 @@ static const Property vfio_pci_dev_properties[] = { sub_vendor_id, PCI_ANY_ID), DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice, sub_device_id, PCI_ANY_ID), + DEFINE_PROP_UINT32("x-pci-class-code", VFIOPCIDevice, + class_code, PCI_ANY_ID), DEFINE_PROP_UINT32("x-igd-gms", VFIOPCIDevice, igd_gms, 0), DEFINE_PROP_UNSIGNED_NODEFAULT("x-nv-gpudirect-clique", VFIOPCIDevice, nv_gpudirect_clique, diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 495fae737d..4aa6461117 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -157,6 +157,7 @@ struct VFIOPCIDevice { uint32_t device_id; uint32_t sub_vendor_id; uint32_t sub_device_id; + uint32_t class_code; uint32_t features; #define VFIO_FEATURE_ENABLE_VGA_BIT 0 #define VFIO_FEATURE_ENABLE_VGA (1 << VFIO_FEATURE_ENABLE_VGA_BIT) @@ -205,10 +206,7 @@ static inline bool vfio_pci_is(VFIOPCIDevice *vdev, uint32_t vendor, uint32_t de static inline bool vfio_is_vga(VFIOPCIDevice *vdev) { - PCIDevice *pdev = &vdev->pdev; - uint16_t class = pci_get_word(pdev->config + PCI_CLASS_DEVICE); - - return class == PCI_CLASS_DISPLAY_VGA; + return (vdev->class_code >> 8) == PCI_CLASS_DISPLAY_VGA; } /* MSI/MSI-X/INTx */ diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 8ec0ad0cde..fc6ed230d0 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -48,6 +48,7 @@ vfio_pci_emulated_vendor_id(const char *name, uint16_t val) "%s 0x%04x" vfio_pci_emulated_device_id(const char *name, uint16_t val) "%s 0x%04x" vfio_pci_emulated_sub_vendor_id(const char *name, uint16_t val) "%s 0x%04x" vfio_pci_emulated_sub_device_id(const char *name, uint16_t val) "%s 0x%04x" +vfio_pci_emulated_class_code(const char *name, uint32_t val) "%s 0x%06x" # pci-quirks.c vfio_quirk_rom_in_denylist(const char *name, uint16_t vid, uint16_t did) "%s %04x:%04x" From 6380b0a02fbdac253b8a98b300398319ab655237 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 15 Jul 2025 16:37:36 +0200 Subject: [PATCH 6/7] vfio/migration: Add x-migration-load-config-after-iter VFIO property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This property allows configuring whether to start the config load only after all iterables were loaded, during non-iterables loading phase. Such interlocking is required for ARM64 due to this platform VFIO dependency on interrupt controller being loaded first. The property defaults to AUTO, which means ON for ARM, OFF for other platforms. Reviewed-by: Fabiano Rosas Reviewed-by: Avihai Horon Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/0e03c60dbc91f9a9ba2516929574df605b7dfcb4.1752589295.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- docs/devel/migration/vfio.rst | 6 +++ hw/core/machine.c | 1 + hw/vfio/helpers.c | 17 +++++++ hw/vfio/migration-multifd.c | 79 +++++++++++++++++++++++++++++++ hw/vfio/migration-multifd.h | 3 ++ hw/vfio/migration.c | 10 +++- hw/vfio/pci.c | 10 ++++ hw/vfio/vfio-helpers.h | 2 + hw/vfio/vfio-migration-internal.h | 1 + include/hw/vfio/vfio-device.h | 1 + 10 files changed, 129 insertions(+), 1 deletion(-) diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst index 2d8e5ca9dd..dae3a98830 100644 --- a/docs/devel/migration/vfio.rst +++ b/docs/devel/migration/vfio.rst @@ -247,3 +247,9 @@ The multifd VFIO device state transfer is controlled by "x-migration-multifd-transfer" VFIO device property. This property defaults to AUTO, which means that VFIO device state transfer via multifd channels is attempted in configurations that otherwise support it. + +Some host platforms (like ARM64) require that VFIO device config is loaded only +after all iterables were loaded, during non-iterables loading phase. +Such interlocking is controlled by "x-migration-load-config-after-iter" VFIO +device property, which in its default setting (AUTO) does so only on platforms +that actually require it. diff --git a/hw/core/machine.c b/hw/core/machine.c index e869821b22..16640b700f 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -39,6 +39,7 @@ GlobalProperty hw_compat_10_0[] = { { "scsi-hd", "dpofua", "off" }, + { "vfio-pci", "x-migration-load-config-after-iter", "off" }, }; const size_t hw_compat_10_0_len = G_N_ELEMENTS(hw_compat_10_0); diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c index 9a5f621545..23d13e5db5 100644 --- a/hw/vfio/helpers.c +++ b/hw/vfio/helpers.c @@ -209,3 +209,20 @@ retry: return info; } + +bool vfio_arch_wants_loading_config_after_iter(void) +{ + /* + * Starting the config load only after all iterables were loaded (during + * non-iterables loading phase) is required for ARM64 due to this platform + * VFIO dependency on interrupt controller being loaded first. + * + * See commit d329f5032e17 ("vfio: Move the saving of the config space to + * the right place in VFIO migration"). + */ +#if defined(TARGET_ARM) + return true; +#else + return false; +#endif +} diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index 55635486c8..e539befaa9 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -23,6 +23,7 @@ #include "migration-multifd.h" #include "vfio-migration-internal.h" #include "trace.h" +#include "vfio-helpers.h" #define VFIO_DEVICE_STATE_CONFIG_STATE (1) @@ -35,6 +36,18 @@ typedef struct VFIODeviceStatePacket { uint8_t data[0]; } QEMU_PACKED VFIODeviceStatePacket; +bool vfio_load_config_after_iter(VFIODevice *vbasedev) +{ + if (vbasedev->migration_load_config_after_iter == ON_OFF_AUTO_ON) { + return true; + } else if (vbasedev->migration_load_config_after_iter == ON_OFF_AUTO_OFF) { + return false; + } + + assert(vbasedev->migration_load_config_after_iter == ON_OFF_AUTO_AUTO); + return vfio_arch_wants_loading_config_after_iter(); +} + /* type safety */ typedef struct VFIOStateBuffers { GArray *array; @@ -50,6 +63,9 @@ typedef struct VFIOMultifd { bool load_bufs_thread_running; bool load_bufs_thread_want_exit; + bool load_bufs_iter_done; + QemuCond load_bufs_iter_done_cond; + VFIOStateBuffers load_bufs; QemuCond load_bufs_buffer_ready_cond; QemuCond load_bufs_thread_finished_cond; @@ -394,6 +410,22 @@ static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp) multifd->load_buf_idx++; } + if (vfio_load_config_after_iter(vbasedev)) { + while (!multifd->load_bufs_iter_done) { + qemu_cond_wait(&multifd->load_bufs_iter_done_cond, + &multifd->load_bufs_mutex); + + /* + * Need to re-check cancellation immediately after wait in case + * cond was signalled by vfio_load_cleanup_load_bufs_thread(). + */ + if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) { + error_setg(errp, "operation cancelled"); + goto thread_exit; + } + } + } + if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) { goto thread_exit; } @@ -413,6 +445,48 @@ thread_exit: return ret; } +int vfio_load_state_config_load_ready(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + VFIOMultifd *multifd = migration->multifd; + int ret = 0; + + if (!vfio_multifd_transfer_enabled(vbasedev)) { + error_report("%s: got DEV_CONFIG_LOAD_READY outside multifd transfer", + vbasedev->name); + return -EINVAL; + } + + if (!vfio_load_config_after_iter(vbasedev)) { + error_report("%s: got DEV_CONFIG_LOAD_READY but was disabled", + vbasedev->name); + return -EINVAL; + } + + assert(multifd); + + /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ + bql_unlock(); + WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { + if (multifd->load_bufs_iter_done) { + /* Can't print error here as we're outside BQL */ + ret = -EINVAL; + break; + } + + multifd->load_bufs_iter_done = true; + qemu_cond_signal(&multifd->load_bufs_iter_done_cond); + } + bql_lock(); + + if (ret) { + error_report("%s: duplicate DEV_CONFIG_LOAD_READY", + vbasedev->name); + } + + return ret; +} + static VFIOMultifd *vfio_multifd_new(void) { VFIOMultifd *multifd = g_new(VFIOMultifd, 1); @@ -425,6 +499,9 @@ static VFIOMultifd *vfio_multifd_new(void) multifd->load_buf_idx_last = UINT32_MAX; qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); + multifd->load_bufs_iter_done = false; + qemu_cond_init(&multifd->load_bufs_iter_done_cond); + multifd->load_bufs_thread_running = false; multifd->load_bufs_thread_want_exit = false; qemu_cond_init(&multifd->load_bufs_thread_finished_cond); @@ -448,6 +525,7 @@ static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd) multifd->load_bufs_thread_want_exit = true; qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); + qemu_cond_signal(&multifd->load_bufs_iter_done_cond); qemu_cond_wait(&multifd->load_bufs_thread_finished_cond, &multifd->load_bufs_mutex); } @@ -460,6 +538,7 @@ static void vfio_multifd_free(VFIOMultifd *multifd) vfio_load_cleanup_load_bufs_thread(multifd); qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond); + qemu_cond_destroy(&multifd->load_bufs_iter_done_cond); vfio_state_buffers_destroy(&multifd->load_bufs); qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond); qemu_mutex_destroy(&multifd->load_bufs_mutex); diff --git a/hw/vfio/migration-multifd.h b/hw/vfio/migration-multifd.h index ebf22a7997..82d2d3a1fd 100644 --- a/hw/vfio/migration-multifd.h +++ b/hw/vfio/migration-multifd.h @@ -20,9 +20,12 @@ void vfio_multifd_cleanup(VFIODevice *vbasedev); bool vfio_multifd_transfer_supported(void); bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev); +bool vfio_load_config_after_iter(VFIODevice *vbasedev); bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, Error **errp); +int vfio_load_state_config_load_ready(VFIODevice *vbasedev); + void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f); bool diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index c329578eec..4c06e3db93 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -675,7 +675,11 @@ static void vfio_save_state(QEMUFile *f, void *opaque) int ret; if (vfio_multifd_transfer_enabled(vbasedev)) { - vfio_multifd_emit_dummy_eos(vbasedev, f); + if (vfio_load_config_after_iter(vbasedev)) { + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_LOAD_READY); + } else { + vfio_multifd_emit_dummy_eos(vbasedev, f); + } return; } @@ -784,6 +788,10 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) return ret; } + case VFIO_MIG_FLAG_DEV_CONFIG_LOAD_READY: + { + return vfio_load_state_config_load_ready(vbasedev); + } default: error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data); return -EINVAL; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 910042c6c2..09acad002a 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3642,6 +3642,9 @@ static const Property vfio_pci_dev_properties[] = { vbasedev.migration_multifd_transfer, vfio_pci_migration_multifd_transfer_prop, OnOffAuto, .set_default = true, .defval.i = ON_OFF_AUTO_AUTO), + DEFINE_PROP_ON_OFF_AUTO("x-migration-load-config-after-iter", VFIOPCIDevice, + vbasedev.migration_load_config_after_iter, + ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice, vbasedev.migration_events, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), @@ -3818,6 +3821,13 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) "x-migration-multifd-transfer", "Transfer this device state via " "multifd channels when live migrating it"); + object_class_property_set_description(klass, /* 10.1 */ + "x-migration-load-config-after-iter", + "Start the config load only after " + "all iterables were loaded (during " + "non-iterables loading phase) when " + "doing live migration of device state " + "via multifd channels"); } static const TypeInfo vfio_pci_dev_info = { diff --git a/hw/vfio/vfio-helpers.h b/hw/vfio/vfio-helpers.h index 54a327ffbc..ce31758080 100644 --- a/hw/vfio/vfio-helpers.h +++ b/hw/vfio/vfio-helpers.h @@ -32,4 +32,6 @@ struct vfio_device_info *vfio_get_device_info(int fd); int vfio_kvm_device_add_fd(int fd, Error **errp); int vfio_kvm_device_del_fd(int fd, Error **errp); +bool vfio_arch_wants_loading_config_after_iter(void); + #endif /* HW_VFIO_VFIO_HELPERS_H */ diff --git a/hw/vfio/vfio-migration-internal.h b/hw/vfio/vfio-migration-internal.h index a8b456b239..54141e27e6 100644 --- a/hw/vfio/vfio-migration-internal.h +++ b/hw/vfio/vfio-migration-internal.h @@ -32,6 +32,7 @@ #define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) #define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) #define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL) +#define VFIO_MIG_FLAG_DEV_CONFIG_LOAD_READY (0xffffffffef100006ULL) typedef struct VFIODevice VFIODevice; typedef struct VFIOMultifd VFIOMultifd; diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 1901a35aa9..dac3fdce15 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -67,6 +67,7 @@ typedef struct VFIODevice { bool ram_block_discard_allowed; OnOffAuto enable_migration; OnOffAuto migration_multifd_transfer; + OnOffAuto migration_load_config_after_iter; bool migration_events; bool use_region_fds; VFIODeviceOps *ops; From 300dcf58b72fa1635190b19f102231b0775e93cb Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 15 Jul 2025 16:37:37 +0200 Subject: [PATCH 7/7] vfio/migration: Max in-flight VFIO device state buffers size limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow capping the maximum total size of in-flight VFIO device state buffers queued at the destination, otherwise a malicious QEMU source could theoretically cause the target QEMU to allocate unlimited amounts of memory for buffers-in-flight. Since this is not expected to be a realistic threat in most of VFIO live migration use cases and the right value depends on the particular setup disable this limit by default by setting it to UINT64_MAX. Reviewed-by: Fabiano Rosas Reviewed-by: Avihai Horon Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/qemu-devel/4f7cad490988288f58e36b162d7a888ed7e7fd17.1752589295.git.maciej.szmigiero@oracle.com Signed-off-by: Cédric Le Goater --- docs/devel/migration/vfio.rst | 13 +++++++++++++ hw/vfio/migration-multifd.c | 21 +++++++++++++++++++-- hw/vfio/pci.c | 9 +++++++++ include/hw/vfio/vfio-device.h | 1 + 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst index dae3a98830..0790e5031d 100644 --- a/docs/devel/migration/vfio.rst +++ b/docs/devel/migration/vfio.rst @@ -248,6 +248,19 @@ The multifd VFIO device state transfer is controlled by AUTO, which means that VFIO device state transfer via multifd channels is attempted in configurations that otherwise support it. +Since the target QEMU needs to load device state buffers in-order it needs to +queue incoming buffers until they can be loaded into the device. +This means that a malicious QEMU source could theoretically cause the target +QEMU to allocate unlimited amounts of memory for such buffers-in-flight. + +The "x-migration-max-queued-buffers-size" property allows capping the total size +of these VFIO device state buffers queued at the destination. + +Because a malicious QEMU source causing OOM on the target is not expected to be +a realistic threat in most of VFIO live migration use cases and the right value +depends on the particular setup by default this queued buffers size limit is +disabled by setting it to UINT64_MAX. + Some host platforms (like ARM64) require that VFIO device config is loaded only after all iterables were loaded, during non-iterables loading phase. Such interlocking is controlled by "x-migration-load-config-after-iter" VFIO diff --git a/hw/vfio/migration-multifd.c b/hw/vfio/migration-multifd.c index e539befaa9..d522671b8d 100644 --- a/hw/vfio/migration-multifd.c +++ b/hw/vfio/migration-multifd.c @@ -72,6 +72,7 @@ typedef struct VFIOMultifd { QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */ uint32_t load_buf_idx; uint32_t load_buf_idx_last; + size_t load_buf_queued_pending_buffers_size; } VFIOMultifd; static void vfio_state_buffer_clear(gpointer data) @@ -128,6 +129,7 @@ static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, VFIOMigration *migration = vbasedev->migration; VFIOMultifd *multifd = migration->multifd; VFIOStateBuffer *lb; + size_t data_size = packet_total_size - sizeof(*packet); vfio_state_buffers_assert_init(&multifd->load_bufs); if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) { @@ -143,8 +145,19 @@ static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, assert(packet->idx >= multifd->load_buf_idx); - lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet)); - lb->len = packet_total_size - sizeof(*packet); + multifd->load_buf_queued_pending_buffers_size += data_size; + if (multifd->load_buf_queued_pending_buffers_size > + vbasedev->migration_max_queued_buffers_size) { + error_setg(errp, + "%s: queuing state buffer %" PRIu32 + " would exceed the size max of %" PRIu64, + vbasedev->name, packet->idx, + vbasedev->migration_max_queued_buffers_size); + return false; + } + + lb->data = g_memdup2(&packet->data, data_size); + lb->len = data_size; lb->is_present = true; return true; @@ -328,6 +341,9 @@ static bool vfio_load_state_buffer_write(VFIODevice *vbasedev, assert(wr_ret <= buf_len); buf_len -= wr_ret; buf_cur += wr_ret; + + assert(multifd->load_buf_queued_pending_buffers_size >= wr_ret); + multifd->load_buf_queued_pending_buffers_size -= wr_ret; } trace_vfio_load_state_device_buffer_load_end(vbasedev->name, @@ -497,6 +513,7 @@ static VFIOMultifd *vfio_multifd_new(void) multifd->load_buf_idx = 0; multifd->load_buf_idx_last = UINT32_MAX; + multifd->load_buf_queued_pending_buffers_size = 0; qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); multifd->load_bufs_iter_done = false; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 09acad002a..be05002b98 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3645,6 +3645,8 @@ static const Property vfio_pci_dev_properties[] = { DEFINE_PROP_ON_OFF_AUTO("x-migration-load-config-after-iter", VFIOPCIDevice, vbasedev.migration_load_config_after_iter, ON_OFF_AUTO_AUTO), + DEFINE_PROP_SIZE("x-migration-max-queued-buffers-size", VFIOPCIDevice, + vbasedev.migration_max_queued_buffers_size, UINT64_MAX), DEFINE_PROP_BOOL("migration-events", VFIOPCIDevice, vbasedev.migration_events, false), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), @@ -3828,6 +3830,13 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) "non-iterables loading phase) when " "doing live migration of device state " "via multifd channels"); + object_class_property_set_description(klass, /* 10.1 */ + "x-migration-max-queued-buffers-size", + "Maximum size of in-flight VFIO " + "device state buffers queued at the " + "destination when doing live " + "migration of device state via " + "multifd channels"); } static const TypeInfo vfio_pci_dev_info = { diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index dac3fdce15..6e4d5ccdac 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -68,6 +68,7 @@ typedef struct VFIODevice { OnOffAuto enable_migration; OnOffAuto migration_multifd_transfer; OnOffAuto migration_load_config_after_iter; + uint64_t migration_max_queued_buffers_size; bool migration_events; bool use_region_fds; VFIODeviceOps *ops;