From 144227dbc46f82d76de065e507a256a4d723b9e4 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Mon, 9 Jun 2025 19:54:33 +0800 Subject: [PATCH 01/27] vfio/container: Fix vfio_listener_commit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's wrong to call into listener_begin callback in vfio_listener_commit(). Currently this impacts vfio-user. Fixes: d9b7d8b6993b ("vfio/container: pass listener_begin/commit callbacks") Signed-off-by: Zhenzhong Duan Reviewed-by: John Levon Link: https://lore.kernel.org/qemu-devel/20250609115433.401775-1-zhenzhong.duan@intel.com Signed-off-by: Cédric Le Goater --- hw/vfio/listener.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index 203ed0314e..735b5f21b7 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -437,7 +437,7 @@ static void vfio_listener_commit(MemoryListener *listener) listener); void (*listener_commit)(VFIOContainerBase *bcontainer); - listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin; + listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit; if (listener_commit) { listener_commit(bcontainer); From 6c4f752ea7c57b9782da0669599659a9d63df431 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 11 Jun 2025 10:42:28 +0800 Subject: [PATCH 02/27] vfio/pci: Fix instance_size of VFIO_PCI_BASE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the final instance_size of VFIO_PCI_BASE is sizeof(PCIDevice). It should be sizeof(VFIOPCIDevice), VFIO_PCI uses same structure as base class VFIO_PCI_BASE, so no need to set its instance_size explicitly. This isn't catastrophic only because VFIO_PCI_BASE is an abstract class. Fixes: d4e392d0a99b ("vfio: add vfio-pci-base class") Signed-off-by: Zhenzhong Duan Reviewed-by: John Levon Reviewed-by: Cédric Le Goater Reviewed-by: Yi Liu Link: https://lore.kernel.org/qemu-devel/20250611024228.423666-1-zhenzhong.duan@intel.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b1250d85bf..6748f4e876 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3425,7 +3425,7 @@ static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) static const TypeInfo vfio_pci_base_dev_info = { .name = TYPE_VFIO_PCI_BASE, .parent = TYPE_PCI_DEVICE, - .instance_size = 0, + .instance_size = sizeof(VFIOPCIDevice), .abstract = true, .class_init = vfio_pci_base_dev_class_init, .interfaces = (const InterfaceInfo[]) { @@ -3647,7 +3647,6 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) static const TypeInfo vfio_pci_dev_info = { .name = TYPE_VFIO_PCI, .parent = TYPE_VFIO_PCI_BASE, - .instance_size = sizeof(VFIOPCIDevice), .class_init = vfio_pci_dev_class_init, .instance_init = vfio_instance_init, .instance_finalize = vfio_instance_finalize, From 0fb8a62fe46ca0cdcc75479658301b7e1c3aa797 Mon Sep 17 00:00:00 2001 From: Rorie Reyes Date: Mon, 9 Jun 2025 12:44:15 -0400 Subject: [PATCH 03/27] hw/vfio/ap: notification handler for AP config changed event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register an event notifier handler to process AP configuration change events by queuing the event and generating a CRW to let the guest know its AP configuration has changed Signed-off-by: Rorie Reyes Reviewed-by: Anthony Krowiak Link: https://lore.kernel.org/qemu-devel/20250609164418.17585-2-rreyes@linux.ibm.com Signed-off-by: Cédric Le Goater --- hw/vfio/ap.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 785c0a0197..93c74ebedb 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -18,6 +18,7 @@ #include "hw/vfio/vfio-device.h" #include "system/iommufd.h" #include "hw/s390x/ap-device.h" +#include "hw/s390x/css.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" #include "qemu/main-loop.h" @@ -37,6 +38,7 @@ struct VFIOAPDevice { APDevice apdev; VFIODevice vdev; EventNotifier req_notifier; + EventNotifier cfg_notifier; }; OBJECT_DECLARE_SIMPLE_TYPE(VFIOAPDevice, VFIO_AP_DEVICE) @@ -70,6 +72,18 @@ static void vfio_ap_req_notifier_handler(void *opaque) } } +static void vfio_ap_cfg_chg_notifier_handler(void *opaque) +{ + VFIOAPDevice *vapdev = opaque; + + if (!event_notifier_test_and_clear(&vapdev->cfg_notifier)) { + return; + } + + css_generate_css_crws(0); + +} + static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, unsigned int irq, Error **errp) { @@ -85,6 +99,10 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, notifier = &vapdev->req_notifier; fd_read = vfio_ap_req_notifier_handler; break; + case VFIO_AP_CFG_CHG_IRQ_INDEX: + notifier = &vapdev->cfg_notifier; + fd_read = vfio_ap_cfg_chg_notifier_handler; + break; default: error_setg(errp, "vfio: Unsupported device irq(%d)", irq); return false; @@ -137,6 +155,9 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev, case VFIO_AP_REQ_IRQ_INDEX: notifier = &vapdev->req_notifier; break; + case VFIO_AP_CFG_CHG_IRQ_INDEX: + notifier = &vapdev->cfg_notifier; + break; default: error_report("vfio: Unsupported device irq(%d)", irq); return; @@ -176,6 +197,15 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) warn_report_err(err); } + if (!vfio_ap_register_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX, &err)) + { + /* + * Report this error, but do not make it a failing condition. + * Lack of this IRQ in the host does not prevent normal operation. + */ + warn_report_err(err); + } + return; error: @@ -188,6 +218,7 @@ static void vfio_ap_unrealize(DeviceState *dev) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX); + vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_CFG_CHG_IRQ_INDEX); vfio_device_detach(&vapdev->vdev); g_free(vapdev->vdev.name); } From bc36d14e13ee9bfc17a6818c5e1664fb8e316621 Mon Sep 17 00:00:00 2001 From: Rorie Reyes Date: Mon, 9 Jun 2025 12:44:16 -0400 Subject: [PATCH 04/27] hw/vfio/ap: store object indicating AP config changed in a queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Creates an object indicating that an AP configuration change event has been received and stores it in a queue. These objects will later be used to store event information for an AP configuration change when the CHSC instruction is intercepted. Signed-off-by: Rorie Reyes Reviewed-by: Anthony Krowiak Link: https://lore.kernel.org/qemu-devel/20250609164418.17585-3-rreyes@linux.ibm.com Signed-off-by: Cédric Le Goater --- hw/vfio/ap.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 93c74ebedb..681fd4a4f1 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -21,6 +21,7 @@ #include "hw/s390x/css.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" +#include "qemu/lockable.h" #include "qemu/main-loop.h" #include "qemu/module.h" #include "qemu/option.h" @@ -41,6 +42,15 @@ struct VFIOAPDevice { EventNotifier cfg_notifier; }; +typedef struct APConfigChgEvent { + QTAILQ_ENTRY(APConfigChgEvent) next; +} APConfigChgEvent; + +static QTAILQ_HEAD(, APConfigChgEvent) cfg_chg_events = + QTAILQ_HEAD_INITIALIZER(cfg_chg_events); + +static QemuMutex cfg_chg_events_lock; + OBJECT_DECLARE_SIMPLE_TYPE(VFIOAPDevice, VFIO_AP_DEVICE) static void vfio_ap_compute_needs_reset(VFIODevice *vdev) @@ -74,12 +84,19 @@ static void vfio_ap_req_notifier_handler(void *opaque) static void vfio_ap_cfg_chg_notifier_handler(void *opaque) { + APConfigChgEvent *cfg_chg_event; VFIOAPDevice *vapdev = opaque; if (!event_notifier_test_and_clear(&vapdev->cfg_notifier)) { return; } + cfg_chg_event = g_new0(APConfigChgEvent, 1); + + WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) { + QTAILQ_INSERT_TAIL(&cfg_chg_events, cfg_chg_event, next); + } + css_generate_css_crws(0); } From fd0336021553dfcccb92522552d0ac9475e831d8 Mon Sep 17 00:00:00 2001 From: Rorie Reyes Date: Mon, 9 Jun 2025 12:44:17 -0400 Subject: [PATCH 05/27] hw/vfio/ap: Storing event information for an AP configuration change event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions can be invoked by the function that handles interception of the CHSC SEI instruction for requests indicating the accessibility of one or more adjunct processors has changed. Signed-off-by: Rorie Reyes Reviewed-by: Anthony Krowiak Link: https://lore.kernel.org/qemu-devel/20250609164418.17585-4-rreyes@linux.ibm.com Signed-off-by: Cédric Le Goater --- hw/vfio/ap.c | 40 ++++++++++++++++++++++++++++++++++++ include/hw/s390x/ap-bridge.h | 39 +++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 681fd4a4f1..874e0d1eaf 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -10,6 +10,7 @@ * directory. */ +#include #include "qemu/osdep.h" #include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include @@ -101,6 +102,38 @@ static void vfio_ap_cfg_chg_notifier_handler(void *opaque) } +int ap_chsc_sei_nt0_get_event(void *res) +{ + ChscSeiNt0Res *nt0_res = (ChscSeiNt0Res *)res; + APConfigChgEvent *cfg_chg_event; + + WITH_QEMU_LOCK_GUARD(&cfg_chg_events_lock) { + if (QTAILQ_EMPTY(&cfg_chg_events)) { + return EVENT_INFORMATION_NOT_STORED; + } + + cfg_chg_event = QTAILQ_FIRST(&cfg_chg_events); + QTAILQ_REMOVE(&cfg_chg_events, cfg_chg_event, next); + } + + memset(nt0_res, 0, sizeof(*nt0_res)); + g_free(cfg_chg_event); + nt0_res->flags |= PENDING_EVENT_INFO_BITMASK; + nt0_res->length = sizeof(ChscSeiNt0Res); + nt0_res->code = NT0_RES_RESPONSE_CODE; + nt0_res->nt = NT0_RES_NT_DEFAULT; + nt0_res->rs = NT0_RES_RS_AP_CHANGE; + nt0_res->cc = NT0_RES_CC_AP_CHANGE; + + return EVENT_INFORMATION_STORED; +} + +bool ap_chsc_sei_nt0_have_event(void) +{ + QEMU_LOCK_GUARD(&cfg_chg_events_lock); + return !QTAILQ_EMPTY(&cfg_chg_events); +} + static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, unsigned int irq, Error **errp) { @@ -197,6 +230,13 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp) VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev); VFIODevice *vbasedev = &vapdev->vdev; + static bool lock_initialized; + + if (!lock_initialized) { + qemu_mutex_init(&cfg_chg_events_lock); + lock_initialized = true; + } + if (!vfio_device_get_name(vbasedev, errp)) { return; } diff --git a/include/hw/s390x/ap-bridge.h b/include/hw/s390x/ap-bridge.h index 470e439a98..7efc52928d 100644 --- a/include/hw/s390x/ap-bridge.h +++ b/include/hw/s390x/ap-bridge.h @@ -16,4 +16,43 @@ void s390_init_ap(void); +typedef struct ChscSeiNt0Res { + uint16_t length; + uint16_t code; + uint8_t reserved1; + uint16_t reserved2; + uint8_t nt; +#define PENDING_EVENT_INFO_BITMASK 0x80; + uint8_t flags; + uint8_t reserved3; + uint8_t rs; + uint8_t cc; +} QEMU_PACKED ChscSeiNt0Res; + +#define NT0_RES_RESPONSE_CODE 1 +#define NT0_RES_NT_DEFAULT 0 +#define NT0_RES_RS_AP_CHANGE 5 +#define NT0_RES_CC_AP_CHANGE 3 + +#define EVENT_INFORMATION_NOT_STORED 1 +#define EVENT_INFORMATION_STORED 0 + +/** + * ap_chsc_sei_nt0_get_event - Retrieve the next pending AP config + * change event + * @res: Pointer to a ChscSeiNt0Res struct to be filled with event + * data + * + * This function checks for any pending AP config change events and, + * if present, populates the provided response structure with the + * appropriate SEI NT0 fields. + * + * Return: + * EVENT_INFORMATION_STORED - An event was available and written to @res + * EVENT_INFORMATION_NOT_STORED - No event was available + */ +int ap_chsc_sei_nt0_get_event(void *res); + +bool ap_chsc_sei_nt0_have_event(void); + #endif From c393e6d181c98b76b09fb5120c68f458d5a05d4b Mon Sep 17 00:00:00 2001 From: Rorie Reyes Date: Mon, 9 Jun 2025 12:44:18 -0400 Subject: [PATCH 06/27] s390: implementing CHSC SEI for AP config change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Handle interception of the CHSC SEI instruction for requests indicating the guest's AP configuration has changed. If configuring --without-default-devices, hw/s390x/ap-stub.c was created to handle such circumstance. Also added the following to hw/s390x/meson.build if CONFIG_VFIO_AP is false, it will use the stub file. Signed-off-by: Rorie Reyes Reviewed-by: Anthony Krowiak Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250609164418.17585-5-rreyes@linux.ibm.com Signed-off-by: Cédric Le Goater --- MAINTAINERS | 1 + hw/s390x/ap-stub.c | 21 +++++++++++++++++++++ hw/s390x/meson.build | 1 + target/s390x/ioinst.c | 11 +++++++++-- 4 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 hw/s390x/ap-stub.c diff --git a/MAINTAINERS b/MAINTAINERS index aa6763077e..1e84bfeaee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -112,6 +112,7 @@ F: hw/intc/s390_flic.c F: hw/intc/s390_flic_kvm.c F: hw/s390x/ F: hw/vfio/ap.c +F: hw/s390x/ap-stub.c F: hw/vfio/ccw.c F: hw/watchdog/wdt_diag288.c F: include/hw/s390x/ diff --git a/hw/s390x/ap-stub.c b/hw/s390x/ap-stub.c new file mode 100644 index 0000000000..001fe5f8b0 --- /dev/null +++ b/hw/s390x/ap-stub.c @@ -0,0 +1,21 @@ +/* + * VFIO based AP matrix device assignment + * + * Copyright 2025 IBM Corp. + * Author(s): Rorie Reyes + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/s390x/ap-bridge.h" + +int ap_chsc_sei_nt0_get_event(void *res) +{ + return EVENT_INFORMATION_NOT_STORED; +} + +bool ap_chsc_sei_nt0_have_event(void) +{ + return false; +} diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build index 3bbebfd817..99cbcbd7d6 100644 --- a/hw/s390x/meson.build +++ b/hw/s390x/meson.build @@ -33,6 +33,7 @@ s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files( )) s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c')) s390x_ss.add(when: 'CONFIG_VFIO', if_true: files('s390-pci-vfio.c')) +s390x_ss.add(when: 'CONFIG_VFIO_AP', if_false: files('ap-stub.c')) virtio_ss = ss.source_set() virtio_ss.add(files('virtio-ccw.c')) diff --git a/target/s390x/ioinst.c b/target/s390x/ioinst.c index fe62ba5b06..2320dd4c12 100644 --- a/target/s390x/ioinst.c +++ b/target/s390x/ioinst.c @@ -18,6 +18,7 @@ #include "trace.h" #include "hw/s390x/s390-pci-bus.h" #include "target/s390x/kvm/pv.h" +#include "hw/s390x/ap-bridge.h" /* All I/O instructions but chsc use the s format */ static uint64_t get_address_from_regs(CPUS390XState *env, uint32_t ipb, @@ -574,13 +575,19 @@ out: static int chsc_sei_nt0_get_event(void *res) { - /* no events yet */ + if (s390_has_feat(S390_FEAT_AP)) { + return ap_chsc_sei_nt0_get_event(res); + } + return 1; } static int chsc_sei_nt0_have_event(void) { - /* no events yet */ + if (s390_has_feat(S390_FEAT_AP)) { + return ap_chsc_sei_nt0_have_event(); + } + return 0; } From 7163c0bca72ae70a5be2d455c5348255a6016f04 Mon Sep 17 00:00:00 2001 From: John Levon Date: Fri, 6 Jun 2025 17:10:33 -0700 Subject: [PATCH 07/27] vfio: export PCI helpers needed for vfio-user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vfio-user code will need to re-use various parts of the vfio PCI code. Export them in hw/vfio/pci.h, and rename them to the vfio_pci_* namespace. Signed-off-by: John Levon Reviewed-by: Mark Cave-Ayland Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250607001056.335310-2-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 48 ++++++++++++++++++++++---------------------- hw/vfio/pci.h | 11 ++++++++++ hw/vfio/trace-events | 6 +++--- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 6748f4e876..2901cedf6f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -103,7 +103,7 @@ static void vfio_intx_interrupt(void *opaque) } } -static void vfio_intx_eoi(VFIODevice *vbasedev) +void vfio_pci_intx_eoi(VFIODevice *vbasedev) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); @@ -111,7 +111,7 @@ static void vfio_intx_eoi(VFIODevice *vbasedev) return; } - trace_vfio_intx_eoi(vbasedev->name); + trace_vfio_pci_intx_eoi(vbasedev->name); vdev->intx.pending = false; pci_irq_deassert(&vdev->pdev); @@ -236,7 +236,7 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route) } /* Re-enable the interrupt in cased we missed an EOI */ - vfio_intx_eoi(&vdev->vbasedev); + vfio_pci_intx_eoi(&vdev->vbasedev); } static void vfio_intx_routing_notifier(PCIDevice *pdev) @@ -1743,7 +1743,7 @@ static bool vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) return true; } -static void vfio_teardown_msi(VFIOPCIDevice *vdev) +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev) { msi_uninit(&vdev->pdev); @@ -1839,7 +1839,7 @@ static void vfio_bars_register(VFIOPCIDevice *vdev) } } -static void vfio_bars_exit(VFIOPCIDevice *vdev) +void vfio_pci_bars_exit(VFIOPCIDevice *vdev) { int i; @@ -2430,7 +2430,7 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) g_free(config); } -static bool vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; @@ -2706,7 +2706,7 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, - .vfio_eoi = vfio_intx_eoi, + .vfio_eoi = vfio_pci_intx_eoi, .vfio_get_object = vfio_pci_get_object, .vfio_save_config = vfio_pci_save_config, .vfio_load_config = vfio_pci_load_config, @@ -2777,7 +2777,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) return true; } -static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; struct vfio_region_info *reg_info = NULL; @@ -2823,7 +2823,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) return false; } - trace_vfio_populate_device_config(vdev->vbasedev.name, + trace_vfio_pci_populate_device_config(vdev->vbasedev.name, (unsigned long)reg_info->size, (unsigned long)reg_info->offset, (unsigned long)reg_info->flags); @@ -2845,7 +2845,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ - trace_vfio_populate_device_get_irq_info_failure(strerror(-ret)); + trace_vfio_pci_populate_device_get_irq_info_failure(strerror(-ret)); } else if (irq_info.count == 1) { vdev->pci_aer = true; } else { @@ -2857,7 +2857,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) return true; } -static void vfio_pci_put_device(VFIOPCIDevice *vdev) +void vfio_pci_put_device(VFIOPCIDevice *vdev) { vfio_display_finalize(vdev); vfio_bars_finalize(vdev); @@ -2905,7 +2905,7 @@ static void vfio_err_notifier_handler(void *opaque) * and continue after disabling error recovery support for the * device. */ -static void vfio_register_err_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev) { Error *err = NULL; int32_t fd; @@ -2964,7 +2964,7 @@ static void vfio_req_notifier_handler(void *opaque) } } -static void vfio_register_req_notifier(VFIOPCIDevice *vdev) +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev) { struct vfio_irq_info irq_info; Error *err = NULL; @@ -3018,7 +3018,7 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) vdev->req_enabled = false; } -static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; VFIODevice *vbasedev = &vdev->vbasedev; @@ -3124,7 +3124,7 @@ static bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp) return true; } -static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) { PCIDevice *pdev = &vdev->pdev; @@ -3214,7 +3214,7 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp) goto error; } - if (!vfio_populate_device(vdev, errp)) { + if (!vfio_pci_populate_device(vdev, errp)) { goto error; } @@ -3228,7 +3228,7 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp) goto out_teardown; } - if (!vfio_add_capabilities(vdev, errp)) { + if (!vfio_pci_add_capabilities(vdev, errp)) { goto out_unset_idev; } @@ -3244,7 +3244,7 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp) vfio_bar_quirk_setup(vdev, i); } - if (!vfio_interrupt_setup(vdev, errp)) { + if (!vfio_pci_interrupt_setup(vdev, errp)) { goto out_unset_idev; } @@ -3288,8 +3288,8 @@ static void vfio_pci_realize(PCIDevice *pdev, Error **errp) } } - vfio_register_err_notifier(vdev); - vfio_register_req_notifier(vdev); + vfio_pci_register_err_notifier(vdev); + vfio_pci_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); return; @@ -3310,8 +3310,8 @@ out_unset_idev: pci_device_unset_iommu_device(pdev); } out_teardown: - vfio_teardown_msi(vdev); - vfio_bars_exit(vdev); + vfio_pci_teardown_msi(vdev); + vfio_pci_bars_exit(vdev); error: error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); } @@ -3338,9 +3338,9 @@ static void vfio_exitfn(PCIDevice *pdev) if (vdev->intx.mmap_timer) { timer_free(vdev->intx.mmap_timer); } - vfio_teardown_msi(vdev); + vfio_pci_teardown_msi(vdev); vfio_pci_disable_rp_atomics(vdev); - vfio_bars_exit(vdev); + vfio_pci_bars_exit(vdev); vfio_migration_exit(vbasedev); if (!vbasedev->mdev) { pci_device_unset_iommu_device(pdev); diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 5ce0fb916f..d4c6b2e7b7 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -248,4 +248,15 @@ void vfio_display_finalize(VFIOPCIDevice *vdev); extern const VMStateDescription vfio_display_vmstate; +void vfio_pci_bars_exit(VFIOPCIDevice *vdev); +bool vfio_pci_add_capabilities(VFIOPCIDevice *vdev, Error **errp); +bool vfio_pci_config_setup(VFIOPCIDevice *vdev, Error **errp); +bool vfio_pci_interrupt_setup(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_intx_eoi(VFIODevice *vbasedev); +void vfio_pci_put_device(VFIOPCIDevice *vdev); +bool vfio_pci_populate_device(VFIOPCIDevice *vdev, Error **errp); +void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev); +void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev); +void vfio_pci_teardown_msi(VFIOPCIDevice *vdev); + #endif /* HW_VFIO_VFIO_PCI_H */ diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index e90ec9bff8..f06236f37b 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -2,7 +2,7 @@ # pci.c vfio_intx_interrupt(const char *name, char line) " (%s) Pin %c" -vfio_intx_eoi(const char *name) " (%s) EOI" +vfio_pci_intx_eoi(const char *name) " (%s) EOI" vfio_intx_enable_kvm(const char *name) " (%s) KVM INTx accel enabled" vfio_intx_disable_kvm(const char *name) " (%s) KVM INTx accel disabled" vfio_intx_update(const char *name, int new_irq, int target_irq) " (%s) IRQ moved %d -> %d" @@ -35,8 +35,8 @@ vfio_pci_hot_reset(const char *name, const char *type) " (%s) %s" vfio_pci_hot_reset_has_dep_devices(const char *name) "%s: hot reset dependent devices:" vfio_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int group_id) "\t%04x:%02x:%02x.%x group %d" vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %s" -vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx" -vfio_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" +vfio_pci_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device '%s' config: size: 0x%lx, offset: 0x%lx, flags: 0x%lx" +vfio_pci_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s" vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d" vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x" vfio_pci_reset(const char *name) " (%s)" From 59adfc6f1843538d78373296fd05a57ced1f3ecb Mon Sep 17 00:00:00 2001 From: John Levon Date: Fri, 6 Jun 2025 17:10:35 -0700 Subject: [PATCH 08/27] vfio: add per-region fd support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For vfio-user, each region has its own fd rather than sharing vbasedev's. Add the necessary plumbing to support this, and use the correct fd in vfio_region_mmap(). Signed-off-by: John Levon Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250607001056.335310-4-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/device.c | 29 +++++++++++++++++++++++++---- hw/vfio/region.c | 9 +++++++-- include/hw/vfio/vfio-device.h | 7 +++++-- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 9fba2c7272..a4bdde8e8b 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -200,6 +200,7 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info) { size_t argsz = sizeof(struct vfio_region_info); + int fd = -1; int ret; /* check cache */ @@ -214,7 +215,7 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index, retry: (*info)->argsz = argsz; - ret = vbasedev->io_ops->get_region_info(vbasedev, *info); + ret = vbasedev->io_ops->get_region_info(vbasedev, *info, &fd); if (ret != 0) { g_free(*info); *info = NULL; @@ -225,11 +226,19 @@ retry: argsz = (*info)->argsz; *info = g_realloc(*info, argsz); + if (fd != -1) { + close(fd); + fd = -1; + } + goto retry; } /* fill cache */ vbasedev->reginfo[index] = *info; + if (vbasedev->region_fds != NULL) { + vbasedev->region_fds[index] = fd; + } return 0; } @@ -334,6 +343,7 @@ void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, vbasedev->io_ops = &vfio_device_io_ops_ioctl; vbasedev->dev = dev; vbasedev->fd = -1; + vbasedev->use_region_fds = false; vbasedev->ram_block_discard_allowed = ram_discard; } @@ -444,6 +454,9 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, vbasedev->reginfo = g_new0(struct vfio_region_info *, vbasedev->num_regions); + if (vbasedev->use_region_fds) { + vbasedev->region_fds = g_new0(int, vbasedev->num_regions); + } } void vfio_device_unprepare(VFIODevice *vbasedev) @@ -452,9 +465,14 @@ void vfio_device_unprepare(VFIODevice *vbasedev) for (i = 0; i < vbasedev->num_regions; i++) { g_free(vbasedev->reginfo[i]); + if (vbasedev->region_fds != NULL && vbasedev->region_fds[i] != -1) { + close(vbasedev->region_fds[i]); + } + } - g_free(vbasedev->reginfo); - vbasedev->reginfo = NULL; + + g_clear_pointer(&vbasedev->reginfo, g_free); + g_clear_pointer(&vbasedev->region_fds, g_free); QLIST_REMOVE(vbasedev, container_next); QLIST_REMOVE(vbasedev, global_next); @@ -476,10 +494,13 @@ static int vfio_device_io_device_feature(VFIODevice *vbasedev, } static int vfio_device_io_get_region_info(VFIODevice *vbasedev, - struct vfio_region_info *info) + struct vfio_region_info *info, + int *fd) { int ret; + *fd = -1; + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); return ret < 0 ? -errno : ret; diff --git a/hw/vfio/region.c b/hw/vfio/region.c index 34752c3f65..cb172f2136 100644 --- a/hw/vfio/region.c +++ b/hw/vfio/region.c @@ -241,6 +241,7 @@ int vfio_region_mmap(VFIORegion *region) { int i, ret, prot = 0; char *name; + int fd; if (!region->mem) { return 0; @@ -271,14 +272,18 @@ int vfio_region_mmap(VFIORegion *region) goto no_mmap; } + /* Use the per-region fd if set, or the shared fd. */ + fd = region->vbasedev->region_fds ? + region->vbasedev->region_fds[region->nr] : + region->vbasedev->fd, + map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align); munmap(map_base, map_align - map_base); munmap(map_align + region->mmaps[i].size, align - (map_align - map_base)); region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot, - MAP_SHARED | MAP_FIXED, - region->vbasedev->fd, + MAP_SHARED | MAP_FIXED, fd, region->fd_offset + region->mmaps[i].offset); if (region->mmaps[i].mmap == MAP_FAILED) { diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 8bcb3c19f6..bf54fc6920 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -66,6 +66,7 @@ typedef struct VFIODevice { OnOffAuto enable_migration; OnOffAuto migration_multifd_transfer; bool migration_events; + bool use_region_fds; VFIODeviceOps *ops; VFIODeviceIOOps *io_ops; unsigned int num_irqs; @@ -84,6 +85,7 @@ typedef struct VFIODevice { VFIOIOASHwpt *hwpt; QLIST_ENTRY(VFIODevice) hwpt_next; struct vfio_region_info **reginfo; + int *region_fds; } VFIODevice; struct VFIODeviceOps { @@ -170,10 +172,11 @@ struct VFIODeviceIOOps { /** * @get_region_info * - * Fill in @info with information on the region given by @info->index. + * Fill in @info (and optionally @fd) with information on the region given + * by @info->index. */ int (*get_region_info)(VFIODevice *vdev, - struct vfio_region_info *info); + struct vfio_region_info *info, int *fd); /** * @get_irq_info From a574b0614449760a83310226b548e21b08a805fa Mon Sep 17 00:00:00 2001 From: John Levon Date: Fri, 6 Jun 2025 17:10:36 -0700 Subject: [PATCH 09/27] vfio: mark posted writes in region write callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For vfio-user, the region write implementation needs to know if the write is posted; add the necessary plumbing to support this. Signed-off-by: John Levon Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250607001056.335310-5-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/device.c | 3 ++- hw/vfio/pci.c | 5 ++++- hw/vfio/region.c | 3 ++- include/hw/vfio/vfio-device.h | 4 ++-- include/hw/vfio/vfio-region.h | 1 + 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/hw/vfio/device.c b/hw/vfio/device.c index a4bdde8e8b..d32600eda1 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -543,7 +543,8 @@ static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index, } static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index, - off_t off, uint32_t size, void *data) + off_t off, uint32_t size, void *data, + bool post) { struct vfio_region_info *info; int ret; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 2901cedf6f..89f9246416 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -989,7 +989,7 @@ static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset, { return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev, VFIO_PCI_CONFIG_REGION_INDEX, - offset, size, data); + offset, size, data, false); } static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) @@ -1793,6 +1793,9 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) bar->type = pci_bar & (bar->ioport ? ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK); bar->size = bar->region.size; + + /* IO regions are sync, memory can be async */ + bar->region.post_wr = (bar->ioport == 0); } static void vfio_bars_prepare(VFIOPCIDevice *vdev) diff --git a/hw/vfio/region.c b/hw/vfio/region.c index cb172f2136..f5b8e3cbf1 100644 --- a/hw/vfio/region.c +++ b/hw/vfio/region.c @@ -66,7 +66,7 @@ void vfio_region_write(void *opaque, hwaddr addr, } ret = vbasedev->io_ops->region_write(vbasedev, region->nr, - addr, size, &buf); + addr, size, &buf, region->post_wr); if (ret != size) { error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 ",%d) failed: %s", @@ -200,6 +200,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, region->size = info->size; region->fd_offset = info->offset; region->nr = index; + region->post_wr = false; if (region->size) { region->mem = g_new0(MemoryRegion, 1); diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index bf54fc6920..9793b2dba0 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -205,10 +205,10 @@ struct VFIODeviceIOOps { * @region_write * * Write @size bytes to the region @nr at offset @off from the buffer - * @data. + * @data; if @post, the write is posted. */ int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, - void *data); + void *data, bool post); }; void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, diff --git a/include/hw/vfio/vfio-region.h b/include/hw/vfio/vfio-region.h index cbffb26962..ede6e0c8f9 100644 --- a/include/hw/vfio/vfio-region.h +++ b/include/hw/vfio/vfio-region.h @@ -29,6 +29,7 @@ typedef struct VFIORegion { uint32_t nr_mmaps; VFIOMmap *mmaps; uint8_t nr; /* cache the region number for debug */ + bool post_wr; /* writes can be posted */ } VFIORegion; From f95fd60ac164a75c586ce29ebcc94bb6df85b52a Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:14 -0700 Subject: [PATCH 10/27] migration: cpr helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the cpr_incoming_needed, cpr_open_fd, and cpr_resave_fd helpers, for use when adding cpr support for vfio and iommufd. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-2-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- include/migration/cpr.h | 5 +++++ migration/cpr.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/include/migration/cpr.h b/include/migration/cpr.h index 7561fc75ad..07858e93fa 100644 --- a/include/migration/cpr.h +++ b/include/migration/cpr.h @@ -18,6 +18,9 @@ void cpr_save_fd(const char *name, int id, int fd); void cpr_delete_fd(const char *name, int id); int cpr_find_fd(const char *name, int id); +void cpr_resave_fd(const char *name, int id, int fd); +int cpr_open_fd(const char *path, int flags, const char *name, int id, + Error **errp); MigMode cpr_get_incoming_mode(void); void cpr_set_incoming_mode(MigMode mode); @@ -28,6 +31,8 @@ int cpr_state_load(MigrationChannel *channel, Error **errp); void cpr_state_close(void); struct QIOChannel *cpr_state_ioc(void); +bool cpr_incoming_needed(void *opaque); + QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp); QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp); diff --git a/migration/cpr.c b/migration/cpr.c index 42c46563e5..a50a57edca 100644 --- a/migration/cpr.c +++ b/migration/cpr.c @@ -95,6 +95,36 @@ int cpr_find_fd(const char *name, int id) trace_cpr_find_fd(name, id, fd); return fd; } + +void cpr_resave_fd(const char *name, int id, int fd) +{ + CprFd *elem = find_fd(&cpr_state.fds, name, id); + int old_fd = elem ? elem->fd : -1; + + if (old_fd < 0) { + cpr_save_fd(name, id, fd); + } else if (old_fd != fd) { + error_setg(&error_fatal, + "internal error: cpr fd '%s' id %d value %d " + "already saved with a different value %d", + name, id, fd, old_fd); + } +} + +int cpr_open_fd(const char *path, int flags, const char *name, int id, + Error **errp) +{ + int fd = cpr_find_fd(name, id); + + if (fd < 0) { + fd = qemu_open(path, flags, errp); + if (fd >= 0) { + cpr_save_fd(name, id, fd); + } + } + return fd; +} + /*************************************************************************/ #define CPR_STATE "CprState" @@ -228,3 +258,9 @@ void cpr_state_close(void) cpr_state_file = NULL; } } + +bool cpr_incoming_needed(void *opaque) +{ + MigMode mode = migrate_mode(); + return mode == MIG_MODE_CPR_TRANSFER; +} From 081c09dc52bab0facfdd1d34fd466479af6ac531 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:15 -0700 Subject: [PATCH 11/27] migration: lower handler priority MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define a vmstate priority that is lower than the default, so its handlers run after all default priority handlers. Since 0 is no longer the default priority, translate an uninitialized priority of 0 to MIG_PRI_DEFAULT. CPR for vfio will use this to install handlers for containers that run after handlers for the devices that they contain. Signed-off-by: Steve Sistare Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/qemu-devel/1749569991-25171-3-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- include/migration/vmstate.h | 6 +++++- migration/savevm.c | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index a1dfab4460..1ff7bd9ac4 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -155,7 +155,11 @@ enum VMStateFlags { }; typedef enum { - MIG_PRI_DEFAULT = 0, + MIG_PRI_UNINITIALIZED = 0, /* An uninitialized priority field maps to */ + /* MIG_PRI_DEFAULT in save_state_priority */ + + MIG_PRI_LOW, /* Must happen after default */ + MIG_PRI_DEFAULT, MIG_PRI_IOMMU, /* Must happen before PCI devices */ MIG_PRI_PCI_BUS, /* Must happen before IOMMU */ MIG_PRI_VIRTIO_MEM, /* Must happen before IOMMU */ diff --git a/migration/savevm.c b/migration/savevm.c index 52105dd2f1..bb04a4520d 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -266,7 +266,7 @@ typedef struct SaveState { static SaveState savevm_state = { .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers), - .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL }, + .handler_pri_head = { [0 ... MIG_PRI_MAX] = NULL }, .global_section_id = 0, }; @@ -737,7 +737,7 @@ static int calculate_compat_instance_id(const char *idstr) static inline MigrationPriority save_state_priority(SaveStateEntry *se) { - if (se->vmsd) { + if (se->vmsd && se->vmsd->priority) { return se->vmsd->priority; } return MIG_PRI_DEFAULT; From 54857b08168a0a74711d1f773c16a7122499027b Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:16 -0700 Subject: [PATCH 12/27] vfio/container: register container for cpr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register a legacy container for cpr-transfer, replacing the generic CPR register call with a more specific legacy container register call. Add a blocker if the kernel does not support VFIO_UPDATE_VADDR or VFIO_UNMAP_ALL. This is mostly boiler plate. The fields to to saved and restored are added in subsequent patches. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-4-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 7 ++-- hw/vfio/cpr-legacy.c | 68 ++++++++++++++++++++++++++++++++ hw/vfio/cpr.c | 5 +-- hw/vfio/meson.build | 1 + include/hw/vfio/vfio-container.h | 2 + include/hw/vfio/vfio-cpr.h | 15 +++++++ 6 files changed, 91 insertions(+), 7 deletions(-) create mode 100644 hw/vfio/cpr-legacy.c diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 0f948d0247..93cdf80e13 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -33,7 +33,6 @@ #include "qapi/error.h" #include "pci.h" #include "hw/vfio/vfio-container.h" -#include "hw/vfio/vfio-cpr.h" #include "vfio-helpers.h" #include "vfio-listener.h" @@ -643,7 +642,7 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, new_container = true; bcontainer = &container->bcontainer; - if (!vfio_cpr_register_container(bcontainer, errp)) { + if (!vfio_legacy_cpr_register_container(container, errp)) { goto fail; } @@ -679,7 +678,7 @@ fail: vioc->release(bcontainer); } if (new_container) { - vfio_cpr_unregister_container(bcontainer); + vfio_legacy_cpr_unregister_container(container); object_unref(container); } if (fd >= 0) { @@ -720,7 +719,7 @@ static void vfio_container_disconnect(VFIOGroup *group) VFIOAddressSpace *space = bcontainer->space; trace_vfio_container_disconnect(container->fd); - vfio_cpr_unregister_container(bcontainer); + vfio_legacy_cpr_unregister_container(container); close(container->fd); object_unref(container); diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c new file mode 100644 index 0000000000..dd7ac84074 --- /dev/null +++ b/hw/vfio/cpr-legacy.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021-2025 Oracle and/or its affiliates. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include +#include +#include "qemu/osdep.h" +#include "hw/vfio/vfio-container.h" +#include "migration/blocker.h" +#include "migration/cpr.h" +#include "migration/migration.h" +#include "migration/vmstate.h" +#include "qapi/error.h" + +static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) +{ + if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) { + error_setg(errp, "VFIO container does not support VFIO_UPDATE_VADDR"); + return false; + + } else if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL)) { + error_setg(errp, "VFIO container does not support VFIO_UNMAP_ALL"); + return false; + + } else { + return true; + } +} + +static const VMStateDescription vfio_container_vmstate = { + .name = "vfio-container", + .version_id = 0, + .minimum_version_id = 0, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; + +bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + Error **cpr_blocker = &container->cpr.blocker; + + migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier, + vfio_cpr_reboot_notifier, + MIG_MODE_CPR_REBOOT); + + if (!vfio_cpr_supported(container, cpr_blocker)) { + return migrate_add_blocker_modes(cpr_blocker, errp, + MIG_MODE_CPR_TRANSFER, -1) == 0; + } + + vmstate_register(NULL, -1, &vfio_container_vmstate, container); + + return true; +} + +void vfio_legacy_cpr_unregister_container(VFIOContainer *container) +{ + VFIOContainerBase *bcontainer = &container->bcontainer; + + migration_remove_notifier(&bcontainer->cpr_reboot_notifier); + migrate_del_blocker(&container->cpr.blocker); + vmstate_unregister(NULL, &vfio_container_vmstate, container); +} diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index 0210e76a10..0e59612228 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -7,13 +7,12 @@ #include "qemu/osdep.h" #include "hw/vfio/vfio-device.h" -#include "migration/misc.h" #include "hw/vfio/vfio-cpr.h" #include "qapi/error.h" #include "system/runstate.h" -static int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, - MigrationEvent *e, Error **errp) +int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) { if (e->type == MIG_EVENT_PRECOPY_SETUP && !runstate_check(RUN_STATE_SUSPENDED) && !vm_get_suspended()) { diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index bccb05098c..73d29f925f 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -21,6 +21,7 @@ system_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c')) system_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c')) system_ss.add(when: 'CONFIG_VFIO', if_true: files( 'cpr.c', + 'cpr-legacy.c', 'device.c', 'migration.c', 'migration-multifd.c', diff --git a/include/hw/vfio/vfio-container.h b/include/hw/vfio/vfio-container.h index afc498da49..21e5807e48 100644 --- a/include/hw/vfio/vfio-container.h +++ b/include/hw/vfio/vfio-container.h @@ -10,6 +10,7 @@ #define HW_VFIO_CONTAINER_H #include "hw/vfio/vfio-container-base.h" +#include "hw/vfio/vfio-cpr.h" typedef struct VFIOContainer VFIOContainer; typedef struct VFIODevice VFIODevice; @@ -29,6 +30,7 @@ typedef struct VFIOContainer { int fd; /* /dev/vfio/vfio, empowered by the attached groups */ unsigned iommu_type; QLIST_HEAD(, VFIOGroup) group_list; + VFIOContainerCPR cpr; } VFIOContainer; OBJECT_DECLARE_SIMPLE_TYPE(VFIOContainer, VFIO_IOMMU_LEGACY); diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index 750ea5bcea..d4e0bd53c9 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -9,8 +9,23 @@ #ifndef HW_VFIO_VFIO_CPR_H #define HW_VFIO_VFIO_CPR_H +#include "migration/misc.h" + +struct VFIOContainer; struct VFIOContainerBase; +typedef struct VFIOContainerCPR { + Error *blocker; +} VFIOContainerCPR; + + +bool vfio_legacy_cpr_register_container(struct VFIOContainer *container, + Error **errp); +void vfio_legacy_cpr_unregister_container(struct VFIOContainer *container); + +int vfio_cpr_reboot_notifier(NotifierWithReturn *notifier, MigrationEvent *e, + Error **errp); + bool vfio_cpr_register_container(struct VFIOContainerBase *bcontainer, Error **errp); void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer); From c29a65ed68535288551d2bf70ae32f3df371a57d Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:17 -0700 Subject: [PATCH 13/27] vfio/container: preserve descriptors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At vfio creation time, save the value of vfio container, group, and device descriptors in CPR state. On qemu restart, vfio_realize() finds and uses the saved descriptors. During reuse, device and iommu state is already configured, so operations in vfio_realize that would modify the configuration, such as vfio ioctl's, are skipped. The result is that vfio_realize constructs qemu data structures that reflect the current state of the device. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Reviewed-by: Zhenzhong Duan Link: https://lore.kernel.org/qemu-devel/1749569991-25171-5-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 67 +++++++++++++++++++++++++++++--------- hw/vfio/cpr-legacy.c | 42 ++++++++++++++++++++++++ include/hw/vfio/vfio-cpr.h | 6 ++++ 3 files changed, 100 insertions(+), 15 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 93cdf80e13..5caae4ccae 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -31,6 +31,8 @@ #include "system/reset.h" #include "trace.h" #include "qapi/error.h" +#include "migration/cpr.h" +#include "migration/blocker.h" #include "pci.h" #include "hw/vfio/vfio-container.h" #include "vfio-helpers.h" @@ -425,7 +427,12 @@ static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group, return NULL; } - if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { + /* + * During CPR, just set the container type and skip the ioctls, as the + * container and group are already configured in the kernel. + */ + if (!cpr_is_incoming() && + !vfio_set_iommu(fd, group->fd, &iommu_type, errp)) { return NULL; } @@ -592,6 +599,11 @@ static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup *group, group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); vfio_group_add_kvm_device(group); + /* + * Remember the container fd for each group, so we can attach to the same + * container after CPR. + */ + cpr_resave_fd("vfio_container_for_group", group->groupid, container->fd); return true; } @@ -601,6 +613,7 @@ static void vfio_container_group_del(VFIOContainer *container, VFIOGroup *group) group->container = NULL; vfio_group_del_kvm_device(group); vfio_ram_block_discard_disable(container, false); + cpr_delete_fd("vfio_container_for_group", group->groupid); } static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, @@ -615,17 +628,34 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, bool group_was_added = false; space = vfio_address_space_get(as); + fd = cpr_find_fd("vfio_container_for_group", group->groupid); - QLIST_FOREACH(bcontainer, &space->containers, next) { - container = container_of(bcontainer, VFIOContainer, bcontainer); - if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { - return vfio_container_group_add(container, group, errp); + if (!cpr_is_incoming()) { + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); + if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + return vfio_container_group_add(container, group, errp); + } } - } - fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); - if (fd < 0) { - goto fail; + fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); + if (fd < 0) { + goto fail; + } + } else { + /* + * For incoming CPR, the group is already attached in the kernel. + * If a container with matching fd is found, then update the + * userland group list and return. If not, then after the loop, + * create the container struct and group list. + */ + QLIST_FOREACH(bcontainer, &space->containers, next) { + container = container_of(bcontainer, VFIOContainer, bcontainer); + + if (vfio_cpr_container_match(container, group, fd)) { + return vfio_container_group_add(container, group, errp); + } + } } ret = ioctl(fd, VFIO_GET_API_VERSION); @@ -697,6 +727,7 @@ static void vfio_container_disconnect(VFIOGroup *group) QLIST_REMOVE(group, container_next); group->container = NULL; + cpr_delete_fd("vfio_container_for_group", group->groupid); /* * Explicitly release the listener first before unset container, @@ -750,7 +781,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp) group = g_malloc0(sizeof(*group)); snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); - group->fd = qemu_open(path, O_RDWR, errp); + group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, errp); if (group->fd < 0) { goto free_group_exit; } @@ -782,6 +813,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace *as, Error **errp) return group; close_fd_exit: + cpr_delete_fd("vfio_group", groupid); close(group->fd); free_group_exit: @@ -803,6 +835,7 @@ static void vfio_group_put(VFIOGroup *group) vfio_container_disconnect(group); QLIST_REMOVE(group, next); trace_vfio_group_put(group->fd); + cpr_delete_fd("vfio_group", group->groupid); close(group->fd); g_free(group); } @@ -813,7 +846,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, g_autofree struct vfio_device_info *info = NULL; int fd; - fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); + fd = vfio_cpr_group_get_device_fd(group->fd, name); if (fd < 0) { error_setg_errno(errp, errno, "error getting device from group %d", group->groupid); @@ -826,8 +859,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, info = vfio_get_device_info(fd); if (!info) { error_setg_errno(errp, errno, "error getting device info"); - close(fd); - return false; + goto fail; } /* @@ -841,8 +873,7 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, if (!QLIST_EMPTY(&group->device_list)) { error_setg(errp, "Inconsistent setting of support for discarding " "RAM (e.g., balloon) within group"); - close(fd); - return false; + goto fail; } if (!group->ram_block_discard_allowed) { @@ -860,6 +891,11 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs); return true; + +fail: + close(fd); + cpr_delete_fd(name, 0); + return false; } static void vfio_device_put(VFIODevice *vbasedev) @@ -870,6 +906,7 @@ static void vfio_device_put(VFIODevice *vbasedev) QLIST_REMOVE(vbasedev, next); vbasedev->group = NULL; trace_vfio_device_put(vbasedev->fd); + cpr_delete_fd(vbasedev->name, 0); close(vbasedev->fd); } diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c index dd7ac84074..ac4a9ab315 100644 --- a/hw/vfio/cpr-legacy.c +++ b/hw/vfio/cpr-legacy.c @@ -8,6 +8,7 @@ #include #include "qemu/osdep.h" #include "hw/vfio/vfio-container.h" +#include "hw/vfio/vfio-device.h" #include "migration/blocker.h" #include "migration/cpr.h" #include "migration/migration.h" @@ -66,3 +67,44 @@ void vfio_legacy_cpr_unregister_container(VFIOContainer *container) migrate_del_blocker(&container->cpr.blocker); vmstate_unregister(NULL, &vfio_container_vmstate, container); } + +int vfio_cpr_group_get_device_fd(int d, const char *name) +{ + const int id = 0; + int fd = cpr_find_fd(name, id); + + if (fd < 0) { + fd = ioctl(d, VFIO_GROUP_GET_DEVICE_FD, name); + if (fd >= 0) { + cpr_save_fd(name, id, fd); + } + } + return fd; +} + +static bool same_device(int fd1, int fd2) +{ + struct stat st1, st2; + + return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev; +} + +bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group, + int fd) +{ + if (container->fd == fd) { + return true; + } + if (!same_device(container->fd, fd)) { + return false; + } + /* + * Same device, different fd. This occurs when the container fd is + * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS + * produces duplicates. De-dup it. + */ + cpr_delete_fd("vfio_container_for_group", group->groupid); + close(fd); + cpr_save_fd("vfio_container_for_group", group->groupid, container->fd); + return true; +} diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index d4e0bd53c9..5a2e5f6b21 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -13,6 +13,7 @@ struct VFIOContainer; struct VFIOContainerBase; +struct VFIOGroup; typedef struct VFIOContainerCPR { Error *blocker; @@ -30,4 +31,9 @@ bool vfio_cpr_register_container(struct VFIOContainerBase *bcontainer, Error **errp); void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer); +int vfio_cpr_group_get_device_fd(int d, const char *name); + +bool vfio_cpr_container_match(struct VFIOContainer *container, + struct VFIOGroup *group, int fd); + #endif /* HW_VFIO_VFIO_CPR_H */ From 1faadd96306b428f2e8e35f71761c333a6777f55 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:18 -0700 Subject: [PATCH 14/27] vfio/container: discard old DMA vaddr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the container pre_save handler, discard the virtual addresses in DMA mappings with VFIO_DMA_UNMAP_FLAG_VADDR, because guest RAM will be remapped at a different VA after in new QEMU. DMA to already-mapped pages continues. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-6-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/cpr-legacy.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c index ac4a9ab315..ef106d0ea8 100644 --- a/hw/vfio/cpr-legacy.c +++ b/hw/vfio/cpr-legacy.c @@ -15,6 +15,22 @@ #include "migration/vmstate.h" #include "qapi/error.h" +static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = VFIO_DMA_UNMAP_FLAG_VADDR | VFIO_DMA_UNMAP_FLAG_ALL, + .iova = 0, + .size = 0, + }; + if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all"); + return false; + } + return true; +} + + static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) { if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) { @@ -30,10 +46,23 @@ static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) } } +static int vfio_container_pre_save(void *opaque) +{ + VFIOContainer *container = opaque; + Error *local_err = NULL; + + if (!vfio_dma_unmap_vaddr_all(container, &local_err)) { + error_report_err(local_err); + return -1; + } + return 0; +} + static const VMStateDescription vfio_container_vmstate = { .name = "vfio-container", .version_id = 0, .minimum_version_id = 0, + .pre_save = vfio_container_pre_save, .needed = cpr_incoming_needed, .fields = (VMStateField[]) { VMSTATE_END_OF_LIST() From 7e9f21411302d823e9ee563ff6a2a40b1ffc1266 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:19 -0700 Subject: [PATCH 15/27] vfio/container: restore DMA vaddr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In new QEMU, do not register the memory listener at device creation time. Register it later, in the container post_load handler, after all vmstate that may affect regions and mapping boundaries has been loaded. The post_load registration will cause the listener to invoke its callback on each flat section, and the calls will match the mappings remembered by the kernel. The listener calls a special dma_map handler that passes the new VA of each section to the kernel using VFIO_DMA_MAP_FLAG_VADDR. Restore the normal handler at the end. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-7-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 15 ++++++++-- hw/vfio/cpr-legacy.c | 57 ++++++++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-cpr.h | 3 ++ 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 5caae4ccae..936ce37f19 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -136,6 +136,8 @@ static int vfio_legacy_dma_unmap_one(const VFIOContainerBase *bcontainer, int ret; Error *local_err = NULL; + g_assert(!cpr_is_incoming()); + if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) { if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) && bcontainer->dirty_pages_supported) { @@ -690,8 +692,17 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, } group_was_added = true; - if (!vfio_listener_register(bcontainer, errp)) { - goto fail; + /* + * If CPR, register the listener later, after all state that may + * affect regions and mapping boundaries has been cpr load'ed. Later, + * the listener will invoke its callback on each flat section and call + * dma_map to supply the new vaddr, and the calls will match the mappings + * remembered by the kernel. + */ + if (!cpr_is_incoming()) { + if (!vfio_listener_register(bcontainer, errp)) { + goto fail; + } } bcontainer->initialized = true; diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c index ef106d0ea8..2fd8348c7c 100644 --- a/hw/vfio/cpr-legacy.c +++ b/hw/vfio/cpr-legacy.c @@ -9,11 +9,13 @@ #include "qemu/osdep.h" #include "hw/vfio/vfio-container.h" #include "hw/vfio/vfio-device.h" +#include "hw/vfio/vfio-listener.h" #include "migration/blocker.h" #include "migration/cpr.h" #include "migration/migration.h" #include "migration/vmstate.h" #include "qapi/error.h" +#include "qemu/error-report.h" static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) { @@ -30,6 +32,32 @@ static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) return true; } +/* + * Set the new @vaddr for any mappings registered during cpr load. + * The incoming state is cleared thereafter. + */ +static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, void *vaddr, + bool readonly, MemoryRegion *mr) +{ + const VFIOContainer *container = container_of(bcontainer, VFIOContainer, + bcontainer); + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_VADDR, + .vaddr = (__u64)(uintptr_t)vaddr, + .iova = iova, + .size = size, + }; + + g_assert(cpr_is_incoming()); + + if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) { + return -errno; + } + + return 0; +} static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) { @@ -58,11 +86,34 @@ static int vfio_container_pre_save(void *opaque) return 0; } +static int vfio_container_post_load(void *opaque, int version_id) +{ + VFIOContainer *container = opaque; + VFIOContainerBase *bcontainer = &container->bcontainer; + VFIOGroup *group; + Error *local_err = NULL; + + if (!vfio_listener_register(bcontainer, &local_err)) { + error_report_err(local_err); + return -1; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + + /* Restore original dma_map function */ + vioc->dma_map = container->cpr.saved_dma_map; + } + return 0; +} + static const VMStateDescription vfio_container_vmstate = { .name = "vfio-container", .version_id = 0, .minimum_version_id = 0, + .priority = MIG_PRI_LOW, /* Must happen after devices and groups */ .pre_save = vfio_container_pre_save, + .post_load = vfio_container_post_load, .needed = cpr_incoming_needed, .fields = (VMStateField[]) { VMSTATE_END_OF_LIST() @@ -85,6 +136,12 @@ bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) vmstate_register(NULL, -1, &vfio_container_vmstate, container); + /* During incoming CPR, divert calls to dma_map. */ + if (cpr_is_incoming()) { + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + container->cpr.saved_dma_map = vioc->dma_map; + vioc->dma_map = vfio_legacy_cpr_dma_map; + } return true; } diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index 5a2e5f6b21..04624475f8 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -17,6 +17,9 @@ struct VFIOGroup; typedef struct VFIOContainerCPR { Error *blocker; + int (*saved_dma_map)(const struct VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + void *vaddr, bool readonly, MemoryRegion *mr); } VFIOContainerCPR; From dac0dd68d9b150a6aa334ab8ee9aeba011d54b32 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:20 -0700 Subject: [PATCH 16/27] vfio/container: mdev cpr blocker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During CPR, after VFIO_DMA_UNMAP_FLAG_VADDR, the vaddr is temporarily invalid, so mediated devices cannot be supported. Add a blocker for them. This restriction will not apply to iommufd containers when CPR is added for them in a future patch. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-8-git-send-email-steven.sistare@oracle.com [ clg: Fixed context change in VFIODevice ] Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 8 ++++++++ include/hw/vfio/vfio-cpr.h | 3 +++ include/hw/vfio/vfio-device.h | 2 ++ 3 files changed, 13 insertions(+) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 936ce37f19..3e8d645ebb 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -987,6 +987,13 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, goto device_put_exit; } + if (vbasedev->mdev) { + error_setg(&vbasedev->cpr.mdev_blocker, + "CPR does not support vfio mdev %s", vbasedev->name); + migrate_add_blocker_modes(&vbasedev->cpr.mdev_blocker, &error_fatal, + MIG_MODE_CPR_TRANSFER, -1); + } + return true; device_put_exit: @@ -1004,6 +1011,7 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev) vfio_device_unprepare(vbasedev); + migrate_del_blocker(&vbasedev->cpr.mdev_blocker); object_unref(vbasedev->hiod); vfio_device_put(vbasedev); vfio_group_put(group); diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index 04624475f8..b83dd42751 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -22,6 +22,9 @@ typedef struct VFIOContainerCPR { void *vaddr, bool readonly, MemoryRegion *mr); } VFIOContainerCPR; +typedef struct VFIODeviceCPR { + Error *mdev_blocker; +} VFIODeviceCPR; bool vfio_legacy_cpr_register_container(struct VFIOContainer *container, Error **errp); diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 9793b2dba0..f39259406b 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -28,6 +28,7 @@ #endif #include "system/system.h" #include "hw/vfio/vfio-container-base.h" +#include "hw/vfio/vfio-cpr.h" #include "system/host_iommu_device.h" #include "system/iommufd.h" @@ -86,6 +87,7 @@ typedef struct VFIODevice { QLIST_ENTRY(VFIODevice) hwpt_next; struct vfio_region_info **reginfo; int *region_fds; + VFIODeviceCPR cpr; } VFIODevice; struct VFIODeviceOps { From eba1f657cbb1b525e2b069626de655da21084e3e Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:21 -0700 Subject: [PATCH 17/27] vfio/container: recover from unmap-all-vaddr failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If there are multiple containers and unmap-all fails for some container, we need to remap vaddr for the other containers for which unmap-all succeeded. Recover by walking all address ranges of all containers to restore the vaddr for each. Do so by invoking the vfio listener callback, and passing a new "remap" flag that tells it to restore a mapping without re-allocating new userland data structures. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-9-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/cpr-legacy.c | 91 +++++++++++++++++++++++++++ hw/vfio/listener.c | 19 +++++- include/hw/vfio/vfio-container-base.h | 3 + include/hw/vfio/vfio-cpr.h | 10 +++ 4 files changed, 122 insertions(+), 1 deletion(-) diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c index 2fd8348c7c..a84c3247b7 100644 --- a/hw/vfio/cpr-legacy.c +++ b/hw/vfio/cpr-legacy.c @@ -29,6 +29,7 @@ static bool vfio_dma_unmap_vaddr_all(VFIOContainer *container, Error **errp) error_setg_errno(errp, errno, "vfio_dma_unmap_vaddr_all"); return false; } + container->cpr.vaddr_unmapped = true; return true; } @@ -59,6 +60,14 @@ static int vfio_legacy_cpr_dma_map(const VFIOContainerBase *bcontainer, return 0; } +static void vfio_region_remap(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, + cpr.remap_listener); + vfio_container_region_add(&container->bcontainer, section, true); +} + static bool vfio_cpr_supported(VFIOContainer *container, Error **errp) { if (!ioctl(container->fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR)) { @@ -120,6 +129,40 @@ static const VMStateDescription vfio_container_vmstate = { } }; +static int vfio_cpr_fail_notifier(NotifierWithReturn *notifier, + MigrationEvent *e, Error **errp) +{ + VFIOContainer *container = + container_of(notifier, VFIOContainer, cpr.transfer_notifier); + VFIOContainerBase *bcontainer = &container->bcontainer; + + if (e->type != MIG_EVENT_PRECOPY_FAILED) { + return 0; + } + + if (container->cpr.vaddr_unmapped) { + /* + * Force a call to vfio_region_remap for each mapped section by + * temporarily registering a listener, and temporarily diverting + * dma_map to vfio_legacy_cpr_dma_map. The latter restores vaddr. + */ + + VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); + vioc->dma_map = vfio_legacy_cpr_dma_map; + + container->cpr.remap_listener = (MemoryListener) { + .name = "vfio cpr recover", + .region_add = vfio_region_remap + }; + memory_listener_register(&container->cpr.remap_listener, + bcontainer->space->as); + memory_listener_unregister(&container->cpr.remap_listener); + container->cpr.vaddr_unmapped = false; + vioc->dma_map = container->cpr.saved_dma_map; + } + return 0; +} + bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) { VFIOContainerBase *bcontainer = &container->bcontainer; @@ -142,6 +185,10 @@ bool vfio_legacy_cpr_register_container(VFIOContainer *container, Error **errp) container->cpr.saved_dma_map = vioc->dma_map; vioc->dma_map = vfio_legacy_cpr_dma_map; } + + migration_add_notifier_mode(&container->cpr.transfer_notifier, + vfio_cpr_fail_notifier, + MIG_MODE_CPR_TRANSFER); return true; } @@ -152,6 +199,50 @@ void vfio_legacy_cpr_unregister_container(VFIOContainer *container) migration_remove_notifier(&bcontainer->cpr_reboot_notifier); migrate_del_blocker(&container->cpr.blocker); vmstate_unregister(NULL, &vfio_container_vmstate, container); + migration_remove_notifier(&container->cpr.transfer_notifier); +} + +/* + * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after + * succeeding for others, so the latter have lost their vaddr. Call this + * to restore vaddr for a section with a giommu. + * + * The giommu already exists. Find it and replay it, which calls + * vfio_legacy_cpr_dma_map further down the stack. + */ +void vfio_cpr_giommu_remap(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + VFIOGuestIOMMU *giommu = NULL; + hwaddr as_offset = section->offset_within_address_space; + hwaddr iommu_offset = as_offset - section->offset_within_region; + + QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { + if (giommu->iommu_mr == IOMMU_MEMORY_REGION(section->mr) && + giommu->iommu_offset == iommu_offset) { + break; + } + } + g_assert(giommu); + memory_region_iommu_replay(giommu->iommu_mr, &giommu->n); +} + +/* + * In old QEMU, VFIO_DMA_UNMAP_FLAG_VADDR may fail on some mapping after + * succeeding for others, so the latter have lost their vaddr. Call this + * to restore vaddr for a section with a RamDiscardManager. + * + * The ram discard listener already exists. Call its populate function + * directly, which calls vfio_legacy_cpr_dma_map. + */ +bool vfio_cpr_ram_discard_register_listener(VFIOContainerBase *bcontainer, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = + vfio_find_ram_discard_listener(bcontainer, section); + + g_assert(vrdl); + return vrdl->listener.notify_populate(&vrdl->listener, section) == 0; } int vfio_cpr_group_get_device_fd(int d, const char *name) diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index 735b5f21b7..f498e23a93 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -481,6 +481,13 @@ static void vfio_listener_region_add(MemoryListener *listener, { VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, listener); + vfio_container_region_add(bcontainer, section, false); +} + +void vfio_container_region_add(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, + bool cpr_remap) +{ hwaddr iova, end; Int128 llend, llsize; void *vaddr; @@ -516,6 +523,11 @@ static void vfio_listener_region_add(MemoryListener *listener, int iommu_idx; trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); + + if (cpr_remap) { + vfio_cpr_giommu_remap(bcontainer, section); + } + /* * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this @@ -558,7 +570,12 @@ static void vfio_listener_region_add(MemoryListener *listener, * about changes. */ if (memory_region_has_ram_discard_manager(section->mr)) { - vfio_ram_discard_register_listener(bcontainer, section); + if (!cpr_remap) { + vfio_ram_discard_register_listener(bcontainer, section); + } else if (!vfio_cpr_ram_discard_register_listener(bcontainer, + section)) { + goto fail; + } return; } diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 9d37f86115..f0232654ee 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -256,4 +256,7 @@ struct VFIOIOMMUClass { VFIORamDiscardListener *vfio_find_ram_discard_listener( VFIOContainerBase *bcontainer, MemoryRegionSection *section); +void vfio_container_region_add(VFIOContainerBase *bcontainer, + MemoryRegionSection *section, bool cpr_remap); + #endif /* HW_VFIO_VFIO_CONTAINER_BASE_H */ diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index b83dd42751..56ede049ad 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -10,6 +10,7 @@ #define HW_VFIO_VFIO_CPR_H #include "migration/misc.h" +#include "system/memory.h" struct VFIOContainer; struct VFIOContainerBase; @@ -17,6 +18,9 @@ struct VFIOGroup; typedef struct VFIOContainerCPR { Error *blocker; + bool vaddr_unmapped; + NotifierWithReturn transfer_notifier; + MemoryListener remap_listener; int (*saved_dma_map)(const struct VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly, MemoryRegion *mr); @@ -42,4 +46,10 @@ int vfio_cpr_group_get_device_fd(int d, const char *name); bool vfio_cpr_container_match(struct VFIOContainer *container, struct VFIOGroup *group, int fd); +void vfio_cpr_giommu_remap(struct VFIOContainerBase *bcontainer, + MemoryRegionSection *section); + +bool vfio_cpr_ram_discard_register_listener( + struct VFIOContainerBase *bcontainer, MemoryRegionSection *section); + #endif /* HW_VFIO_VFIO_CPR_H */ From 8df3fa3d6753332a64eca53780517972075966e1 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:22 -0700 Subject: [PATCH 18/27] pci: export msix_is_pending MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export msix_is_pending for use by cpr. No functional change. Signed-off-by: Steve Sistare Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/qemu-devel/1749569991-25171-10-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/pci/msix.c | 2 +- include/hw/pci/msix.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/pci/msix.c b/hw/pci/msix.c index 66f27b9d71..8c7f6709e2 100644 --- a/hw/pci/msix.c +++ b/hw/pci/msix.c @@ -72,7 +72,7 @@ static uint8_t *msix_pending_byte(PCIDevice *dev, int vector) return dev->msix_pba + vector / 8; } -static int msix_is_pending(PCIDevice *dev, int vector) +int msix_is_pending(PCIDevice *dev, unsigned int vector) { return *msix_pending_byte(dev, vector) & msix_pending_mask(vector); } diff --git a/include/hw/pci/msix.h b/include/hw/pci/msix.h index 0e6f257e45..11ef9454c1 100644 --- a/include/hw/pci/msix.h +++ b/include/hw/pci/msix.h @@ -32,6 +32,7 @@ int msix_present(PCIDevice *dev); bool msix_is_masked(PCIDevice *dev, unsigned vector); void msix_set_pending(PCIDevice *dev, unsigned vector); void msix_clr_pending(PCIDevice *dev, int vector); +int msix_is_pending(PCIDevice *dev, unsigned vector); void msix_vector_use(PCIDevice *dev, unsigned vector); void msix_vector_unuse(PCIDevice *dev, unsigned vector); From 24c156dcd94299f64994170ce84efac9ae001f41 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 10:26:43 -0700 Subject: [PATCH 19/27] pci: skip reset during cpr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not reset a vfio-pci device during CPR. Signed-off-by: Steve Sistare Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/qemu-devel/1749576403-25355-1-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/pci/pci.c | 5 +++++ hw/vfio/pci.c | 7 +++++++ include/hw/pci/pci.h | 2 ++ 3 files changed, 14 insertions(+) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 9b4bf48439..c70b5ceeba 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -32,6 +32,7 @@ #include "hw/pci/pci_host.h" #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" +#include "migration/cpr.h" #include "migration/qemu-file-types.h" #include "migration/vmstate.h" #include "net/net.h" @@ -537,6 +538,10 @@ static void pci_reset_regions(PCIDevice *dev) static void pci_do_device_reset(PCIDevice *dev) { + if ((dev->cap_present & QEMU_PCI_SKIP_RESET_ON_CPR) && cpr_is_incoming()) { + return; + } + pci_device_deassert_intx(dev); assert(dev->irq_state == 0); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 89f9246416..b97261c61b 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3411,6 +3411,13 @@ static void vfio_instance_init(Object *obj) /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command * line, therefore, no need to wait to realize like other devices */ pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; + + /* + * A device that is resuming for cpr is already configured, so do not + * reset it during qemu_system_reset prior to cpr load, else interrupts + * may be lost. + */ + pci_dev->cap_present |= QEMU_PCI_SKIP_RESET_ON_CPR; } static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 35d59d7672..df3cc7b875 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -222,6 +222,8 @@ enum { QEMU_PCIE_EXT_TAG = (1 << QEMU_PCIE_EXT_TAG_BITNR), #define QEMU_PCI_CAP_PM_BITNR 14 QEMU_PCI_CAP_PM = (1 << QEMU_PCI_CAP_PM_BITNR), +#define QEMU_PCI_SKIP_RESET_ON_CPR_BITNR 15 + QEMU_PCI_SKIP_RESET_ON_CPR = (1 << QEMU_PCI_SKIP_RESET_ON_CPR_BITNR), }; typedef struct PCIINTxRoute { From 031fbb7110a183053b431fb667867a5244910e75 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:24 -0700 Subject: [PATCH 20/27] vfio-pci: skip reset during cpr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not reset a vfio-pci device during CPR, and do not complain if the kernel's PCI config space changes for non-emulated bits between the vmstate save and load, which can happen due to ongoing interrupt activity. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-12-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/cpr.c | 31 +++++++++++++++++++++++++++++++ hw/vfio/pci.c | 7 +++++++ include/hw/vfio/vfio-cpr.h | 2 ++ 3 files changed, 40 insertions(+) diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index 0e59612228..fdbb58e203 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -8,6 +8,8 @@ #include "qemu/osdep.h" #include "hw/vfio/vfio-device.h" #include "hw/vfio/vfio-cpr.h" +#include "hw/vfio/pci.h" +#include "migration/cpr.h" #include "qapi/error.h" #include "system/runstate.h" @@ -37,3 +39,32 @@ void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer) { migration_remove_notifier(&bcontainer->cpr_reboot_notifier); } + +/* + * The kernel may change non-emulated config bits. Exclude them from the + * changed-bits check in get_pci_config_device. + */ +static int vfio_cpr_pci_pre_load(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *pdev = &vdev->pdev; + int size = MIN(pci_config_size(pdev), vdev->config_size); + int i; + + for (i = 0; i < size; i++) { + pdev->cmask[i] &= vdev->emulated_config_bits[i]; + } + + return 0; +} + +const VMStateDescription vfio_cpr_pci_vmstate = { + .name = "vfio-cpr-pci", + .version_id = 0, + .minimum_version_id = 0, + .pre_load = vfio_cpr_pci_pre_load, + .needed = cpr_incoming_needed, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + } +}; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index b97261c61b..2da5989581 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -30,6 +30,7 @@ #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" #include "migration/vmstate.h" +#include "migration/cpr.h" #include "qobject/qdict.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -3354,6 +3355,11 @@ static void vfio_pci_reset(DeviceState *dev) { VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev); + /* Do not reset the device during qemu_system_reset prior to cpr load */ + if (cpr_is_incoming()) { + return; + } + trace_vfio_pci_reset(vdev->vbasedev.name); vfio_pci_pre_reset(vdev); @@ -3530,6 +3536,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) #ifdef CONFIG_IOMMUFD object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif + dc->vmsd = &vfio_cpr_pci_vmstate; dc->desc = "VFIO-based PCI device assignment"; pdc->realize = vfio_pci_realize; diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h index 56ede049ad..8bf85b9f4e 100644 --- a/include/hw/vfio/vfio-cpr.h +++ b/include/hw/vfio/vfio-cpr.h @@ -52,4 +52,6 @@ void vfio_cpr_giommu_remap(struct VFIOContainerBase *bcontainer, bool vfio_cpr_ram_discard_register_listener( struct VFIOContainerBase *bcontainer, MemoryRegionSection *section); +extern const VMStateDescription vfio_cpr_pci_vmstate; + #endif /* HW_VFIO_VFIO_CPR_H */ From 906f524ef19fa46140c3a5ae74d5e2e7d7b37ab2 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:25 -0700 Subject: [PATCH 21/27] vfio/pci: vfio_pci_vector_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract a subroutine vfio_pci_vector_init. No functional change. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-13-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 2da5989581..7a33c19ea2 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -531,6 +531,22 @@ static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector, } } +static void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) +{ + VFIOMSIVector *vector = &vdev->msi_vectors[nr]; + PCIDevice *pdev = &vdev->pdev; + + vector->vdev = vdev; + vector->virq = -1; + if (event_notifier_init(&vector->interrupt, 0)) { + error_report("vfio: Error: event_notifier_init failed"); + } + vector->use = true; + if (vdev->interrupt == VFIO_INT_MSIX) { + msix_vector_use(pdev, nr); + } +} + static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { @@ -544,13 +560,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, vector = &vdev->msi_vectors[nr]; if (!vector->use) { - vector->vdev = vdev; - vector->virq = -1; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); - } - vector->use = true; - msix_vector_use(pdev, nr); + vfio_pci_vector_init(vdev, nr); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), From 8f5c6960269124674895032f344fa71f8be5d49c Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:26 -0700 Subject: [PATCH 22/27] vfio/pci: vfio_notifier_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move event_notifier_init calls to a helper vfio_notifier_init. This version is trivial, but it will be expanded to support CPR in subsequent patches. No functional change. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-14-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 7a33c19ea2..dc0d9a7fbb 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -57,6 +57,16 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static void vfio_msi_disable_common(VFIOPCIDevice *vdev); +static bool vfio_notifier_init(EventNotifier *e, const char *name, Error **errp) +{ + int ret = event_notifier_init(e, 0); + + if (ret) { + error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name); + } + return !ret; +} + /* * Disabling BAR mmaping can be slow, but toggling it around INTx can * also be a huge overhead. We try to get the best of both worlds by @@ -137,8 +147,7 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) pci_irq_deassert(&vdev->pdev); /* Get an eventfd for resample/unmask */ - if (event_notifier_init(&vdev->intx.unmask, 0)) { - error_setg(errp, "event_notifier_init failed eoi"); + if (!vfio_notifier_init(&vdev->intx.unmask, "intx-unmask", errp)) { goto fail; } @@ -269,7 +278,6 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1); Error *err = NULL; int32_t fd; - int ret; if (!pin) { @@ -292,9 +300,7 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) } #endif - ret = event_notifier_init(&vdev->intx.interrupt, 0); - if (ret) { - error_setg_errno(errp, -ret, "event_notifier_init failed"); + if (!vfio_notifier_init(&vdev->intx.interrupt, "intx-interrupt", errp)) { return false; } fd = event_notifier_get_fd(&vdev->intx.interrupt); @@ -474,11 +480,13 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) { + const char *name = "kvm_interrupt"; + if (vector->virq < 0) { return; } - if (event_notifier_init(&vector->kvm_interrupt, 0)) { + if (!vfio_notifier_init(&vector->kvm_interrupt, name, NULL)) { goto fail_notifier; } @@ -535,11 +543,12 @@ static void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) { VFIOMSIVector *vector = &vdev->msi_vectors[nr]; PCIDevice *pdev = &vdev->pdev; + Error *err = NULL; vector->vdev = vdev; vector->virq = -1; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); + if (!vfio_notifier_init(&vector->interrupt, "interrupt", &err)) { + error_report_err(err); } vector->use = true; if (vdev->interrupt == VFIO_INT_MSIX) { @@ -755,13 +764,14 @@ retry: for (i = 0; i < vdev->nr_vectors; i++) { VFIOMSIVector *vector = &vdev->msi_vectors[i]; + Error *err = NULL; vector->vdev = vdev; vector->virq = -1; vector->use = true; - if (event_notifier_init(&vector->interrupt, 0)) { - error_report("vfio: Error: event_notifier_init failed"); + if (!vfio_notifier_init(&vector->interrupt, "interrupt", &err)) { + error_report_err(err); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -2928,8 +2938,8 @@ void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->err_notifier, 0)) { - error_report("vfio: Unable to init event notifier for error detection"); + if (!vfio_notifier_init(&vdev->err_notifier, "err_notifier", &err)) { + error_report_err(err); vdev->pci_aer = false; return; } @@ -2995,8 +3005,8 @@ void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev) return; } - if (event_notifier_init(&vdev->req_notifier, 0)) { - error_report("vfio: Unable to init event notifier for device request"); + if (!vfio_notifier_init(&vdev->req_notifier, "req_notifier", &err)) { + error_report_err(err); return; } From d364d802fe4a5d73b2e88cb9b80799692f1ee8a5 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:27 -0700 Subject: [PATCH 23/27] vfio/pci: pass vector to virq functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass the vector number to vfio_connect_kvm_msi_virq and vfio_remove_kvm_msi_virq, so it can be passed to their subroutines in a subsequent patch. No functional change. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-15-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index dc0d9a7fbb..1d22c41efb 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -478,7 +478,7 @@ static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, vector_n, &vdev->pdev); } -static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr) { const char *name = "kvm_interrupt"; @@ -504,7 +504,8 @@ fail_notifier: vector->virq = -1; } -static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) +static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int nr) { kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, vector->virq); @@ -581,7 +582,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, */ if (vector->virq >= 0) { if (!msg) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, nr); } else { vfio_update_kvm_msi_virq(vector, *msg, pdev); } @@ -593,7 +594,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); vfio_add_kvm_msi_virq(vdev, vector, nr, true); kvm_irqchip_commit_route_changes(&vfio_route_change); - vfio_connect_kvm_msi_virq(vector); + vfio_connect_kvm_msi_virq(vector, nr); } } } @@ -687,7 +688,7 @@ static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) kvm_irqchip_commit_route_changes(&vfio_route_change); for (i = 0; i < vdev->nr_vectors; i++) { - vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i]); + vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i], i); } } @@ -827,7 +828,7 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) VFIOMSIVector *vector = &vdev->msi_vectors[i]; if (vdev->msi_vectors[i].use) { if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); + vfio_remove_kvm_msi_virq(vdev, vector, i); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); From c2559182c88ab997727e0f4ef2c0f7faeb24d5a9 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:28 -0700 Subject: [PATCH 24/27] vfio/pci: vfio_notifier_init cpr parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass vdev and nr to vfio_notifier_init, for use by CPR in a subsequent patch. No functional change. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-16-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 1d22c41efb..9ee36da626 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -57,7 +57,8 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); static void vfio_msi_disable_common(VFIOPCIDevice *vdev); -static bool vfio_notifier_init(EventNotifier *e, const char *name, Error **errp) +static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr, Error **errp) { int ret = event_notifier_init(e, 0); @@ -147,7 +148,7 @@ static bool vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) pci_irq_deassert(&vdev->pdev); /* Get an eventfd for resample/unmask */ - if (!vfio_notifier_init(&vdev->intx.unmask, "intx-unmask", errp)) { + if (!vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0, errp)) { goto fail; } @@ -300,7 +301,8 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) } #endif - if (!vfio_notifier_init(&vdev->intx.interrupt, "intx-interrupt", errp)) { + if (!vfio_notifier_init(vdev, &vdev->intx.interrupt, "intx-interrupt", 0, + errp)) { return false; } fd = event_notifier_get_fd(&vdev->intx.interrupt); @@ -486,7 +488,8 @@ static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr) return; } - if (!vfio_notifier_init(&vector->kvm_interrupt, name, NULL)) { + if (!vfio_notifier_init(vector->vdev, &vector->kvm_interrupt, name, nr, + NULL)) { goto fail_notifier; } @@ -544,12 +547,13 @@ static void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) { VFIOMSIVector *vector = &vdev->msi_vectors[nr]; PCIDevice *pdev = &vdev->pdev; - Error *err = NULL; + Error *local_err = NULL; vector->vdev = vdev; vector->virq = -1; - if (!vfio_notifier_init(&vector->interrupt, "interrupt", &err)) { - error_report_err(err); + if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", nr, + &local_err)) { + error_report_err(local_err); } vector->use = true; if (vdev->interrupt == VFIO_INT_MSIX) { @@ -765,14 +769,15 @@ retry: for (i = 0; i < vdev->nr_vectors; i++) { VFIOMSIVector *vector = &vdev->msi_vectors[i]; - Error *err = NULL; + Error *local_err = NULL; vector->vdev = vdev; vector->virq = -1; vector->use = true; - if (!vfio_notifier_init(&vector->interrupt, "interrupt", &err)) { - error_report_err(err); + if (!vfio_notifier_init(vdev, &vector->interrupt, "interrupt", i, + &local_err)) { + error_report_err(local_err); } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), @@ -2939,7 +2944,8 @@ void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev) return; } - if (!vfio_notifier_init(&vdev->err_notifier, "err_notifier", &err)) { + if (!vfio_notifier_init(vdev, &vdev->err_notifier, "err_notifier", 0, + &err)) { error_report_err(err); vdev->pci_aer = false; return; @@ -3006,7 +3012,8 @@ void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev) return; } - if (!vfio_notifier_init(&vdev->req_notifier, "req_notifier", &err)) { + if (!vfio_notifier_init(vdev, &vdev->req_notifier, "req_notifier", 0, + &err)) { error_report_err(err); return; } From 6d7696f329c77332e4db1d18fc5ab5df23363e01 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:29 -0700 Subject: [PATCH 25/27] vfio/pci: vfio_notifier_cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move event_notifier_cleanup calls to a helper vfio_notifier_cleanup. This version is trivial, and does not yet use the vdev and nr parameters. No functional change. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-17-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 9ee36da626..06a7a63cf5 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -68,6 +68,12 @@ static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e, return !ret; } +static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e, + const char *name, int nr) +{ + event_notifier_cleanup(e); +} + /* * Disabling BAR mmaping can be slow, but toggling it around INTx can * also be a huge overhead. We try to get the best of both worlds by @@ -180,7 +186,7 @@ fail_vfio: kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt, vdev->intx.route.irq); fail_irqfd: - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); fail: qemu_set_fd_handler(irq_fd, vfio_intx_interrupt, NULL, vdev); vfio_device_irq_unmask(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX); @@ -212,7 +218,7 @@ static void vfio_intx_disable_kvm(VFIOPCIDevice *vdev) } /* We only need to close the eventfd for VFIO to cleanup the kernel side */ - event_notifier_cleanup(&vdev->intx.unmask); + vfio_notifier_cleanup(vdev, &vdev->intx.unmask, "intx-unmask", 0); /* QEMU starts listening for interrupt events. */ qemu_set_fd_handler(event_notifier_get_fd(&vdev->intx.interrupt), @@ -311,7 +317,7 @@ static bool vfio_intx_enable(VFIOPCIDevice *vdev, Error **errp) if (!vfio_device_irq_set_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); return false; } @@ -338,7 +344,7 @@ static void vfio_intx_disable(VFIOPCIDevice *vdev) fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->intx.interrupt); + vfio_notifier_cleanup(vdev, &vdev->intx.interrupt, "intx-interrupt", 0); vdev->interrupt = VFIO_INT_NONE; @@ -501,7 +507,7 @@ static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector, int nr) return; fail_kvm: - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vector->vdev, &vector->kvm_interrupt, name, nr); fail_notifier: kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; @@ -514,7 +520,7 @@ static void vfio_remove_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; - event_notifier_cleanup(&vector->kvm_interrupt); + vfio_notifier_cleanup(vdev, &vector->kvm_interrupt, "kvm_interrupt", nr); } static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, @@ -837,7 +843,7 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); + vfio_notifier_cleanup(vdev, &vector->interrupt, "interrupt", i); } } @@ -2958,7 +2964,7 @@ void vfio_pci_register_err_notifier(VFIOPCIDevice *vdev) VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); vdev->pci_aer = false; } } @@ -2977,7 +2983,7 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->err_notifier); + vfio_notifier_cleanup(vdev, &vdev->err_notifier, "err_notifier", 0); } static void vfio_req_notifier_handler(void *opaque) @@ -3025,7 +3031,7 @@ void vfio_pci_register_req_notifier(VFIOPCIDevice *vdev) VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); qemu_set_fd_handler(fd, NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); } else { vdev->req_enabled = true; } @@ -3045,7 +3051,7 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) } qemu_set_fd_handler(event_notifier_get_fd(&vdev->req_notifier), NULL, NULL, vdev); - event_notifier_cleanup(&vdev->req_notifier); + vfio_notifier_cleanup(vdev, &vdev->req_notifier, "req_notifier", 0); vdev->req_enabled = false; } From 6f06e3729a6a8b6fc3cae6e91321014ea4c7f85f Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 10 Jun 2025 08:39:30 -0700 Subject: [PATCH 26/27] vfio/pci: export MSI functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export various MSI functions, renamed with a vfio_pci prefix, for use by CPR in subsequent patches. No functional change. Signed-off-by: Steve Sistare Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/1749569991-25171-18-git-send-email-steven.sistare@oracle.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 29 +++++++++++++++++------------ hw/vfio/pci.h | 8 ++++++++ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 06a7a63cf5..fa25bded25 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -351,6 +351,11 @@ static void vfio_intx_disable(VFIOPCIDevice *vdev) trace_vfio_intx_disable(vdev->vbasedev.name); } +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp) +{ + return vfio_intx_enable(vdev, errp); +} + /* * MSI/X */ @@ -475,8 +480,8 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) return ret; } -static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, - int vector_n, bool msix) +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix) { if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) { return; @@ -549,7 +554,7 @@ static void set_irq_signalling(VFIODevice *vbasedev, VFIOMSIVector *vector, } } -static void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr) { VFIOMSIVector *vector = &vdev->msi_vectors[nr]; PCIDevice *pdev = &vdev->pdev; @@ -599,10 +604,10 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, } else { if (msg) { if (vdev->defer_kvm_irq_routing) { - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); } else { vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + vfio_pci_add_kvm_msi_virq(vdev, vector, nr, true); kvm_irqchip_commit_route_changes(&vfio_route_change); vfio_connect_kvm_msi_virq(vector, nr); } @@ -681,14 +686,14 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) } } -static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { assert(!vdev->defer_kvm_irq_routing); vdev->defer_kvm_irq_routing = true; vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); } -static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) { int i; @@ -718,14 +723,14 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) * routes once rather than per vector provides a substantial * performance improvement. */ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, vfio_msix_vector_release, NULL)) { error_report("vfio: msix_set_vector_notifiers failed"); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); if (vdev->nr_vectors) { ret = vfio_enable_vectors(vdev, true); @@ -769,7 +774,7 @@ retry: * Deferring to commit the KVM routes once rather than per vector * provides a substantial performance improvement. */ - vfio_prepare_kvm_msi_virq_batch(vdev); + vfio_pci_prepare_kvm_msi_virq_batch(vdev); vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); @@ -793,10 +798,10 @@ retry: * Attempt to enable route through KVM irqchip, * default to userspace handling if unavailable. */ - vfio_add_kvm_msi_virq(vdev, vector, i, false); + vfio_pci_add_kvm_msi_virq(vdev, vector, i, false); } - vfio_commit_kvm_msi_virq_batch(vdev); + vfio_pci_commit_kvm_msi_virq_batch(vdev); /* Set interrupt type prior to possible interrupts */ vdev->interrupt = VFIO_INT_MSI; diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index d4c6b2e7b7..d3dc2274a9 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -210,6 +210,14 @@ static inline bool vfio_is_vga(VFIOPCIDevice *vdev) return class == PCI_CLASS_DISPLAY_VGA; } +/* MSI/MSI-X/INTx */ +void vfio_pci_vector_init(VFIOPCIDevice *vdev, int nr); +void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, + int vector_n, bool msix); +void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev); +void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev); +bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp); + uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len); From 079e7216debd767e78a77aefc88e2e7335f49b26 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 11 Jun 2025 03:47:53 -0700 Subject: [PATCH 27/27] vfio: improve VFIODeviceIOOps docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explicitly describe every parameter rather than summarizing. Signed-off-by: John Levon Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250611104753.1199796-1-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- include/hw/vfio/vfio-device.h | 52 +++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index f39259406b..d45e5a68a2 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -168,14 +168,25 @@ struct VFIODeviceIOOps { * @device_feature * * Fill in feature info for the given device. + * + * @vdev: #VFIODevice to use + * @feat: feature information to fill in + * + * Returns 0 on success or -errno. */ - int (*device_feature)(VFIODevice *vdev, struct vfio_device_feature *); + int (*device_feature)(VFIODevice *vdev, struct vfio_device_feature *feat); /** * @get_region_info * - * Fill in @info (and optionally @fd) with information on the region given - * by @info->index. + * Get the information for a given region on the device. + * + * @vdev: #VFIODevice to use + * @info: set @info->index to the region index to look up; the rest of the + * struct will be filled in on success + * @fd: pointer to the fd for the region; will be -1 if not found + * + * Returns 0 on success or -errno. */ int (*get_region_info)(VFIODevice *vdev, struct vfio_region_info *info, int *fd); @@ -183,22 +194,38 @@ struct VFIODeviceIOOps { /** * @get_irq_info * - * Fill in @irq with information on the IRQ given by @info->index. + * @vdev: #VFIODevice to use + * @irq: set @irq->index to the IRQ index to look up; the rest of the struct + * will be filled in on success + * + * Returns 0 on success or -errno. */ int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq); /** * @set_irqs * - * Configure IRQs as defined by @irqs. + * Configure IRQs. + * + * @vdev: #VFIODevice to use + * @irqs: IRQ configuration as defined by VFIO docs. + * + * Returns 0 on success or -errno. */ int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs); /** * @region_read * - * Read @size bytes from the region @nr at offset @off into the buffer - * @data. + * Read part of a region. + * + * @vdev: #VFIODevice to use + * @nr: region index + * @off: offset within the region + * @size: size in bytes to read + * @data: buffer to read into + * + * Returns number of bytes read on success or -errno. */ int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, void *data); @@ -206,8 +233,15 @@ struct VFIODeviceIOOps { /** * @region_write * - * Write @size bytes to the region @nr at offset @off from the buffer - * @data; if @post, the write is posted. + * Write part of a region. + * + * @vdev: #VFIODevice to use + * @nr: region index + * @off: offset within the region + * @size: size in bytes to write + * @data: buffer to write from + * + * Returns number of bytes write on success or -errno. */ int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, void *data, bool post);