From 0f1f73bb37bb9e93d108e044224ecad9e4ecb3a5 Mon Sep 17 00:00:00 2001 From: Alejandro Jimenez Date: Mon, 3 Nov 2025 20:32:07 +0000 Subject: [PATCH 01/14] MAINTAINERS: Update entry for AMD-Vi Emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add myself as maintainer and Sairaj Kodilkar as reviewer. Reviewed-by: Cédric Le Goater Acked-by: Sairaj Kodilkar Signed-off-by: Alejandro Jimenez Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Message-Id: <20251103203209.645434-2-alejandro.j.jimenez@oracle.com> --- MAINTAINERS | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index be6efff80c..9cb181e1da 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3933,8 +3933,10 @@ F: tests/functional/x86_64/test_intel_iommu.py F: tests/qtest/intel-iommu-test.c AMD-Vi Emulation -S: Orphan -F: hw/i386/amd_iommu.? +M: Alejandro Jimenez +R: Sairaj Kodilkar +S: Supported +F: hw/i386/amd_iommu* OpenSBI Firmware L: qemu-riscv@nongnu.org From 27d6a0ec0beec45c35a70cd5f12c4530725dda2a Mon Sep 17 00:00:00 2001 From: Sairaj Kodilkar Date: Mon, 3 Nov 2025 20:32:08 +0000 Subject: [PATCH 02/14] amd_iommu: Fix handling of devices on buses != 0 The AMD IOMMU is set up at boot time and uses PCI bus numbers + devfn for indexing into the DTE. The problem is that before the guest starts, all PCI bus numbers are 0 as no PCI discovery has happened yet (BIOS and/or kernel will do that later), so relying on the bus number is wrong. The immediate effect is that emulated devices cannot do DMA when placed on a bus other than 0. Replace the static address_space array with a hash table keyed by devfn and PCIBus*, since these values do not change after the guest boots. 
Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Signed-off-by: Sairaj Kodilkar Reviewed-by: Vasant Hegde Reviewed-by: Alejandro Jimenez Tested-by: Alejandro Jimenez Signed-off-by: Alejandro Jimenez Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Message-Id: <20251103203209.645434-3-alejandro.j.jimenez@oracle.com> --- hw/i386/amd_iommu.c | 134 ++++++++++++++++++++++++++------------------ hw/i386/amd_iommu.h | 2 +- 2 files changed, 79 insertions(+), 57 deletions(-) diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 378e0cb55e..5c5cfd4989 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -59,7 +59,7 @@ const char *amdvi_mmio_high[] = { }; struct AMDVIAddressSpace { - uint8_t bus_num; /* bus number */ + PCIBus *bus; /* PCIBus (for bus number) */ uint8_t devfn; /* device function */ AMDVIState *iommu_state; /* AMDVI - one per machine */ MemoryRegion root; /* AMDVI Root memory map region */ @@ -101,6 +101,11 @@ typedef enum AMDVIFaultReason { AMDVI_FR_PT_ENTRY_INV, /* Failure to read PTE from guest memory */ } AMDVIFaultReason; +typedef struct AMDVIAsKey { + PCIBus *bus; + uint8_t devfn; +} AMDVIAsKey; + uint64_t amdvi_extended_feature_register(AMDVIState *s) { uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES; @@ -382,6 +387,44 @@ static guint amdvi_uint64_hash(gconstpointer v) return (guint)*(const uint64_t *)v; } +static gboolean amdvi_as_equal(gconstpointer v1, gconstpointer v2) +{ + const AMDVIAsKey *key1 = v1; + const AMDVIAsKey *key2 = v2; + + return key1->bus == key2->bus && key1->devfn == key2->devfn; +} + +static guint amdvi_as_hash(gconstpointer v) +{ + const AMDVIAsKey *key = v; + guint bus = (guint)(uintptr_t)key->bus; + + return (guint)(bus << 8 | (guint)key->devfn); +} + +static AMDVIAddressSpace *amdvi_as_lookup(AMDVIState *s, PCIBus *bus, + uint8_t devfn) +{ + const AMDVIAsKey key = { .bus = bus, .devfn = devfn }; + return g_hash_table_lookup(s->address_spaces, &key); +} + +static gboolean 
amdvi_find_as_by_devid(gpointer key, gpointer value, + gpointer user_data) +{ + const AMDVIAsKey *as = key; + const uint16_t *devidp = user_data; + + return *devidp == PCI_BUILD_BDF(pci_bus_num(as->bus), as->devfn); +} + +static AMDVIAddressSpace *amdvi_get_as_by_devid(AMDVIState *s, uint16_t devid) +{ + return g_hash_table_find(s->address_spaces, + amdvi_find_as_by_devid, &devid); +} + static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr, uint64_t devid) { @@ -551,7 +594,7 @@ static inline uint64_t amdvi_get_pte_entry(AMDVIState *s, uint64_t pte_addr, static int amdvi_as_to_dte(AMDVIAddressSpace *as, uint64_t *dte) { - uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); + uint16_t devid = PCI_BUILD_BDF(pci_bus_num(as->bus), as->devfn); AMDVIState *s = as->iommu_state; if (!amdvi_get_dte(s, devid, dte)) { @@ -1011,25 +1054,15 @@ static void amdvi_switch_address_space(AMDVIAddressSpace *amdvi_as) */ static void amdvi_reset_address_translation_all(AMDVIState *s) { - AMDVIAddressSpace **iommu_as; + AMDVIAddressSpace *iommu_as; + GHashTableIter as_it; - for (int bus_num = 0; bus_num < PCI_BUS_MAX; bus_num++) { + g_hash_table_iter_init(&as_it, s->address_spaces); - /* Nothing to do if there are no devices on the current bus */ - if (!s->address_spaces[bus_num]) { - continue; - } - iommu_as = s->address_spaces[bus_num]; - - for (int devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) { - - if (!iommu_as[devfn]) { - continue; - } - /* Use passthrough as default mode after reset */ - iommu_as[devfn]->addr_translation = false; - amdvi_switch_address_space(iommu_as[devfn]); - } + while (g_hash_table_iter_next(&as_it, NULL, (void **)&iommu_as)) { + /* Use passthrough as default mode after reset */ + iommu_as->addr_translation = false; + amdvi_switch_address_space(iommu_as); } } @@ -1089,27 +1122,15 @@ static void enable_nodma_mode(AMDVIAddressSpace *as) */ static void amdvi_update_addr_translation_mode(AMDVIState *s, uint16_t devid) { - uint8_t bus_num, devfn, 
dte_mode; + uint8_t dte_mode; AMDVIAddressSpace *as; uint64_t dte[4] = { 0 }; int ret; - /* - * Convert the devid encoded in the command to a bus and devfn in - * order to retrieve the corresponding address space. - */ - bus_num = PCI_BUS_NUM(devid); - devfn = devid & 0xff; - - /* - * The main buffer of size (AMDVIAddressSpace *) * (PCI_BUS_MAX) has already - * been allocated within AMDVIState, but must be careful to not access - * unallocated devfn. - */ - if (!s->address_spaces[bus_num] || !s->address_spaces[bus_num][devfn]) { + as = amdvi_get_as_by_devid(s, devid); + if (!as) { return; } - as = s->address_spaces[bus_num][devfn]; ret = amdvi_as_to_dte(as, dte); @@ -1783,7 +1804,7 @@ static void amdvi_do_translate(AMDVIAddressSpace *as, hwaddr addr, bool is_write, IOMMUTLBEntry *ret) { AMDVIState *s = as->iommu_state; - uint16_t devid = PCI_BUILD_BDF(as->bus_num, as->devfn); + uint16_t devid = PCI_BUILD_BDF(pci_bus_num(as->bus), as->devfn); AMDVIIOTLBEntry *iotlb_entry = amdvi_iotlb_lookup(s, addr, devid); uint64_t entry[4]; int dte_ret; @@ -1858,7 +1879,7 @@ static IOMMUTLBEntry amdvi_translate(IOMMUMemoryRegion *iommu, hwaddr addr, } amdvi_do_translate(as, addr, flag & IOMMU_WO, &ret); - trace_amdvi_translation_result(as->bus_num, PCI_SLOT(as->devfn), + trace_amdvi_translation_result(pci_bus_num(as->bus), PCI_SLOT(as->devfn), PCI_FUNC(as->devfn), addr, ret.translated_addr); return ret; } @@ -2222,30 +2243,28 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) { char name[128]; AMDVIState *s = opaque; - AMDVIAddressSpace **iommu_as, *amdvi_dev_as; - int bus_num = pci_bus_num(bus); + AMDVIAddressSpace *amdvi_dev_as; + AMDVIAsKey *key; - iommu_as = s->address_spaces[bus_num]; + amdvi_dev_as = amdvi_as_lookup(s, bus, devfn); /* allocate memory during the first run */ - if (!iommu_as) { - iommu_as = g_new0(AMDVIAddressSpace *, PCI_DEVFN_MAX); - s->address_spaces[bus_num] = iommu_as; - } - - /* set up AMD-Vi region */ - if 
(!iommu_as[devfn]) { + if (!amdvi_dev_as) { snprintf(name, sizeof(name), "amd_iommu_devfn_%d", devfn); - iommu_as[devfn] = g_new0(AMDVIAddressSpace, 1); - iommu_as[devfn]->bus_num = (uint8_t)bus_num; - iommu_as[devfn]->devfn = (uint8_t)devfn; - iommu_as[devfn]->iommu_state = s; - iommu_as[devfn]->notifier_flags = IOMMU_NOTIFIER_NONE; - iommu_as[devfn]->iova_tree = iova_tree_new(); - iommu_as[devfn]->addr_translation = false; + amdvi_dev_as = g_new0(AMDVIAddressSpace, 1); + key = g_new0(AMDVIAsKey, 1); - amdvi_dev_as = iommu_as[devfn]; + amdvi_dev_as->bus = bus; + amdvi_dev_as->devfn = (uint8_t)devfn; + amdvi_dev_as->iommu_state = s; + amdvi_dev_as->notifier_flags = IOMMU_NOTIFIER_NONE; + amdvi_dev_as->iova_tree = iova_tree_new(); + amdvi_dev_as->addr_translation = false; + key->bus = bus; + key->devfn = devfn; + + g_hash_table_insert(s->address_spaces, key, amdvi_dev_as); /* * Memory region relationships looks like (Address range shows @@ -2288,7 +2307,7 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) amdvi_switch_address_space(amdvi_dev_as); } - return &iommu_as[devfn]->as; + return &amdvi_dev_as->as; } static const PCIIOMMUOps amdvi_iommu_ops = { @@ -2329,7 +2348,7 @@ static int amdvi_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, if (!s->dma_remap && (new & IOMMU_NOTIFIER_MAP)) { error_setg_errno(errp, ENOTSUP, "device %02x.%02x.%x requires dma-remap=1", - as->bus_num, PCI_SLOT(as->devfn), PCI_FUNC(as->devfn)); + pci_bus_num(as->bus), PCI_SLOT(as->devfn), PCI_FUNC(as->devfn)); return -ENOTSUP; } @@ -2510,6 +2529,9 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, amdvi_uint64_equal, g_free, g_free); + s->address_spaces = g_hash_table_new_full(amdvi_as_hash, + amdvi_as_equal, g_free, g_free); + /* set up MMIO */ memory_region_init_io(&s->mr_mmio, OBJECT(s), &mmio_mem_ops, s, "amdvi-mmio", AMDVI_MMIO_SIZE); diff --git a/hw/i386/amd_iommu.h 
b/hw/i386/amd_iommu.h index daf82fc85f..38471b95d1 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -408,7 +408,7 @@ struct AMDVIState { bool mmio_enabled; /* for each served device */ - AMDVIAddressSpace **address_spaces[PCI_BUS_MAX]; + GHashTable *address_spaces; /* list of address spaces with registered notifiers */ QLIST_HEAD(, AMDVIAddressSpace) amdvi_as_with_notifiers; From 637dc49a3ae6b72cdb6415dcdcdb22ff770f2c8d Mon Sep 17 00:00:00 2001 From: Sairaj Kodilkar Date: Mon, 3 Nov 2025 20:32:09 +0000 Subject: [PATCH 03/14] amd_iommu: Support 64-bit address for IOTLB lookup The physical AMD IOMMU supports up to 64 bits of IOVA. When a device tries to read or write from a given DMA address, the IOMMU translates the address using the I/O page tables assigned to that device. Since the emulated IOMMU uses per-device page tables, an ideal cache tag would need to be 68 bits (64-bit address - 12-bit page alignment + 16-bit device ID). The current software IOTLB implementation uses a GLib hash table with a 64-bit key to hash both the IOVA and device ID, which limits the IOVA to 60 bits. This causes a failure while setting up the device when a guest is booted with "iommu.forcedac=1", which forces the use of DMA addresses at the top of the 64-bit address space. To address this issue, construct the 64-bit hash key using the upper 52 bits of IOVA (GFN) and lower 12 bits of the device ID to avoid truncation as much as possible (reducing hash collisions). Fixes: d29a09ca6842 ("hw/i386: Introduce AMD IOMMU") Signed-off-by: Sairaj Kodilkar Reviewed-by: Vasant Hegde Reviewed-by: Alejandro Jimenez Tested-by: Alejandro Jimenez Signed-off-by: Alejandro Jimenez Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251103203209.645434-4-alejandro.j.jimenez@oracle.com> --- hw/i386/amd_iommu.c | 57 ++++++++++++++++++++++++++++++--------------- hw/i386/amd_iommu.h | 4 ++-- 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 5c5cfd4989..d689a06eca 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -106,6 +106,11 @@ typedef struct AMDVIAsKey { uint8_t devfn; } AMDVIAsKey; +typedef struct AMDVIIOTLBKey { + uint64_t gfn; + uint16_t devid; +} AMDVIIOTLBKey; + uint64_t amdvi_extended_feature_register(AMDVIState *s) { uint64_t feature = AMDVI_DEFAULT_EXT_FEATURES; @@ -377,16 +382,6 @@ static void amdvi_log_pagetab_error(AMDVIState *s, uint16_t devid, PCI_STATUS_SIG_TARGET_ABORT); } -static gboolean amdvi_uint64_equal(gconstpointer v1, gconstpointer v2) -{ - return *((const uint64_t *)v1) == *((const uint64_t *)v2); -} - -static guint amdvi_uint64_hash(gconstpointer v) -{ - return (guint)*(const uint64_t *)v; -} - static gboolean amdvi_as_equal(gconstpointer v1, gconstpointer v2) { const AMDVIAsKey *key1 = v1; @@ -425,11 +420,30 @@ static AMDVIAddressSpace *amdvi_get_as_by_devid(AMDVIState *s, uint16_t devid) amdvi_find_as_by_devid, &devid); } +static gboolean amdvi_iotlb_equal(gconstpointer v1, gconstpointer v2) +{ + const AMDVIIOTLBKey *key1 = v1; + const AMDVIIOTLBKey *key2 = v2; + + return key1->devid == key2->devid && key1->gfn == key2->gfn; +} + +static guint amdvi_iotlb_hash(gconstpointer v) +{ + const AMDVIIOTLBKey *key = v; + /* Use GPA and DEVID to find the bucket */ + return (guint)(key->gfn << AMDVI_PAGE_SHIFT_4K | + (key->devid & ~AMDVI_PAGE_MASK_4K)); +} + + static AMDVIIOTLBEntry *amdvi_iotlb_lookup(AMDVIState *s, hwaddr addr, uint64_t devid) { - uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) | - ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + AMDVIIOTLBKey key = { + .gfn = AMDVI_GET_IOTLB_GFN(addr), + .devid = devid, + }; return g_hash_table_lookup(s->iotlb, &key); } @@ -451,8 +465,10 
@@ static gboolean amdvi_iotlb_remove_by_devid(gpointer key, gpointer value, static void amdvi_iotlb_remove_page(AMDVIState *s, hwaddr addr, uint64_t devid) { - uint64_t key = (addr >> AMDVI_PAGE_SHIFT_4K) | - ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + AMDVIIOTLBKey key = { + .gfn = AMDVI_GET_IOTLB_GFN(addr), + .devid = devid, + }; g_hash_table_remove(s->iotlb, &key); } @@ -463,8 +479,10 @@ static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid, /* don't cache erroneous translations */ if (to_cache.perm != IOMMU_NONE) { AMDVIIOTLBEntry *entry = g_new(AMDVIIOTLBEntry, 1); - uint64_t *key = g_new(uint64_t, 1); - uint64_t gfn = gpa >> AMDVI_PAGE_SHIFT_4K; + AMDVIIOTLBKey *key = g_new(AMDVIIOTLBKey, 1); + + key->gfn = AMDVI_GET_IOTLB_GFN(gpa); + key->devid = devid; trace_amdvi_cache_update(domid, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), gpa, to_cache.translated_addr); @@ -477,7 +495,8 @@ static void amdvi_update_iotlb(AMDVIState *s, uint16_t devid, entry->perms = to_cache.perm; entry->translated_addr = to_cache.translated_addr; entry->page_mask = to_cache.addr_mask; - *key = gfn | ((uint64_t)(devid) << AMDVI_DEVID_SHIFT); + entry->devid = devid; + g_hash_table_replace(s->iotlb, key, entry); } } @@ -2526,8 +2545,8 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) } } - s->iotlb = g_hash_table_new_full(amdvi_uint64_hash, - amdvi_uint64_equal, g_free, g_free); + s->iotlb = g_hash_table_new_full(amdvi_iotlb_hash, + amdvi_iotlb_equal, g_free, g_free); s->address_spaces = g_hash_table_new_full(amdvi_as_hash, amdvi_as_equal, g_free, g_free); diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index 38471b95d1..302ccca512 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -220,8 +220,8 @@ #define PAGE_SIZE_PTE_COUNT(pgsz) (1ULL << ((ctz64(pgsz) - 12) % 9)) /* IOTLB */ -#define AMDVI_IOTLB_MAX_SIZE 1024 -#define AMDVI_DEVID_SHIFT 36 +#define AMDVI_IOTLB_MAX_SIZE 1024 +#define AMDVI_GET_IOTLB_GFN(addr) (addr >> 
AMDVI_PAGE_SHIFT_4K) /* default extended feature */ #define AMDVI_DEFAULT_EXT_FEATURES \ From fde5930cc37175cfcd0f03a089e26f4458a52311 Mon Sep 17 00:00:00 2001 From: Albert Esteve Date: Fri, 17 Oct 2025 09:20:11 +0200 Subject: [PATCH 04/14] vhost-user: fix shared object lookup handler logic Refactor backend_read() function and add a reply_ack variable to have the option for handlers to force tweak whether they should send a reply or not without depending on VHOST_USER_NEED_REPLY_MASK flag. This fixes an issue with vhost_user_backend_handle_shared_object_lookup() logic, as the error path was not closing the backend channel correctly. So, we can remove the reply call from within the handler, make sure it returns early on errors as other handlers do and set the reply_ack variable on backend_read() to true to ensure that it will send a response, thus keeping the original intent. Fixes: 1609476662 ("vhost-user: add shared_object msg") Cc: qemu-stable@nongnu.org Signed-off-by: Albert Esteve Reviewed-by: Stefano Garzarella Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251017072011.1874874-2-aesteve@redhat.com> --- hw/virtio/vhost-user.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index aac98f898a..4b0fae12ae 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1668,14 +1668,6 @@ static bool vhost_user_send_resp(QIOChannel *ioc, VhostUserHeader *hdr, return !qio_channel_writev_all(ioc, iov, ARRAY_SIZE(iov), errp); } -static bool -vhost_user_backend_send_dmabuf_fd(QIOChannel *ioc, VhostUserHeader *hdr, - VhostUserPayload *payload, Error **errp) -{ - hdr->size = sizeof(payload->u64); - return vhost_user_send_resp(ioc, hdr, payload, errp); -} - int vhost_user_get_shared_object(struct vhost_dev *dev, unsigned char *uuid, int *dmabuf_fd) { @@ -1716,19 +1708,15 @@ int vhost_user_get_shared_object(struct vhost_dev *dev, unsigned char *uuid, static int vhost_user_backend_handle_shared_object_lookup(struct vhost_user *u, - QIOChannel *ioc, - VhostUserHeader *hdr, - VhostUserPayload *payload) + VhostUserShared *object) { QemuUUID uuid; CharFrontend *chr = u->user->chr; - Error *local_err = NULL; int dmabuf_fd = -1; int fd_num = 0; - memcpy(uuid.data, payload->object.uuid, sizeof(payload->object.uuid)); + memcpy(uuid.data, object->uuid, sizeof(object->uuid)); - payload->u64 = 0; switch (virtio_object_type(&uuid)) { case TYPE_DMABUF: dmabuf_fd = virtio_lookup_dmabuf(&uuid); @@ -1737,18 +1725,16 @@ vhost_user_backend_handle_shared_object_lookup(struct vhost_user *u, { struct vhost_dev *dev = virtio_lookup_vhost_device(&uuid); if (dev == NULL) { - payload->u64 = -EINVAL; - break; + return -EINVAL; } int ret = vhost_user_get_shared_object(dev, uuid.data, &dmabuf_fd); if (ret < 0) { - payload->u64 = ret; + return ret; } break; } case TYPE_INVALID: - payload->u64 = -EINVAL; - break; + return -EINVAL; } if (dmabuf_fd != -1) { @@ -1757,11 +1743,6 @@ 
vhost_user_backend_handle_shared_object_lookup(struct vhost_user *u, if (qemu_chr_fe_set_msgfds(chr, &dmabuf_fd, fd_num) < 0) { error_report("Failed to set msg fds."); - payload->u64 = -EINVAL; - } - - if (!vhost_user_backend_send_dmabuf_fd(ioc, hdr, payload, &local_err)) { - error_report_err(local_err); return -EINVAL; } @@ -1790,6 +1771,7 @@ static gboolean backend_read(QIOChannel *ioc, GIOCondition condition, struct iovec iov; g_autofree int *fd = NULL; size_t fdsize = 0; + bool reply_ack; int i; /* Read header */ @@ -1808,6 +1790,8 @@ static gboolean backend_read(QIOChannel *ioc, GIOCondition condition, goto err; } + reply_ack = hdr.flags & VHOST_USER_NEED_REPLY_MASK; + /* Read payload */ if (qio_channel_read_all(ioc, (char *) &payload, hdr.size, &local_err)) { error_report_err(local_err); @@ -1833,8 +1817,10 @@ static gboolean backend_read(QIOChannel *ioc, GIOCondition condition, &payload.object); break; case VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP: - ret = vhost_user_backend_handle_shared_object_lookup(dev->opaque, ioc, - &hdr, &payload); + /* The backend always expects a response */ + reply_ack = true; + ret = vhost_user_backend_handle_shared_object_lookup(dev->opaque, + &payload.object); break; default: error_report("Received unexpected msg type: %d.", hdr.request); @@ -1845,7 +1831,7 @@ static gboolean backend_read(QIOChannel *ioc, GIOCondition condition, * REPLY_ACK feature handling. Other reply types has to be managed * directly in their request handlers. */ - if (hdr.flags & VHOST_USER_NEED_REPLY_MASK) { + if (reply_ack) { payload.u64 = !!ret; hdr.size = sizeof(payload.u64); From 43b8c4339709efef153cb2cf21d23d1791890d60 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 17 Oct 2025 05:36:00 -0400 Subject: [PATCH 05/14] intel_iommu: Handle PASID cache invalidation Adds a new entry VTDPASIDCacheEntry in VTDAddressSpace to cache the pasid entry and track PASID usage and future PASID tagged DMA address translation support in vIOMMU. 
When guest triggers pasid cache invalidation, QEMU will capture it and update or invalidate pasid cache. vIOMMU emulator could figure out the reason by fetching latest guest pasid entry in memory and compare it with cached PASID entry if it's valid. Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Yi Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Message-Id: <20251017093602.525338-2-zhenzhong.duan@intel.com> --- hw/i386/intel_iommu.c | 141 ++++++++++++++++++++++++++++++--- hw/i386/intel_iommu_internal.h | 17 ++++ hw/i386/trace-events | 3 + include/hw/i386/intel_iommu.h | 6 ++ 4 files changed, 157 insertions(+), 10 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 6a168d5107..c47f13b659 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -3051,6 +3051,130 @@ static bool vtd_process_piotlb_desc(IntelIOMMUState *s, return true; } +static inline int vtd_dev_get_pe_from_pasid(VTDAddressSpace *vtd_as, + VTDPASIDEntry *pe) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + VTDContextEntry ce; + int ret; + + if (!s->root_scalable) { + return -VTD_FR_RTADDR_INV_TTM; + } + + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, + &ce); + if (ret) { + return ret; + } + + return vtd_ce_get_rid2pasid_entry(s, &ce, pe, vtd_as->pasid); +} + +/* Update or invalidate pasid cache based on the pasid entry in guest memory. */ +static void vtd_pasid_cache_sync_locked(gpointer key, gpointer value, + gpointer user_data) +{ + VTDPASIDCacheInfo *pc_info = user_data; + VTDAddressSpace *vtd_as = value; + VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry; + VTDPASIDEntry pe; + uint16_t did; + + if (vtd_dev_get_pe_from_pasid(vtd_as, &pe)) { + /* + * No valid pasid entry in guest memory. e.g. pasid entry was modified + * to be either all-zero or non-present. Either case means existing + * pasid cache should be invalidated. 
+ */ + pc_entry->valid = false; + return; + } + + /* + * VTD_INV_DESC_PASIDC_G_DSI and VTD_INV_DESC_PASIDC_G_PASID_SI require + * DID check. If DID doesn't match the value in cache or memory, then + * it's not a pasid entry we want to invalidate. + */ + switch (pc_info->type) { + case VTD_INV_DESC_PASIDC_G_PASID_SI: + if (pc_info->pasid != vtd_as->pasid) { + return; + } + /* Fall through */ + case VTD_INV_DESC_PASIDC_G_DSI: + if (pc_entry->valid) { + did = VTD_SM_PASID_ENTRY_DID(pc_entry->pasid_entry.val[1]); + } else { + did = VTD_SM_PASID_ENTRY_DID(pe.val[1]); + } + if (pc_info->did != did) { + return; + } + } + + pc_entry->pasid_entry = pe; + pc_entry->valid = true; +} + +static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info) +{ + if (!s->root_scalable || !s->dmar_enabled) { + return; + } + + vtd_iommu_lock(s); + g_hash_table_foreach(s->vtd_address_spaces, vtd_pasid_cache_sync_locked, + pc_info); + vtd_iommu_unlock(s); +} + +static bool vtd_process_pasid_desc(IntelIOMMUState *s, + VTDInvDesc *inv_desc) +{ + uint16_t did; + uint32_t pasid; + VTDPASIDCacheInfo pc_info = {}; + uint64_t mask[4] = {VTD_INV_DESC_PASIDC_RSVD_VAL0, VTD_INV_DESC_ALL_ONE, + VTD_INV_DESC_ALL_ONE, VTD_INV_DESC_ALL_ONE}; + + if (!vtd_inv_desc_reserved_check(s, inv_desc, mask, true, + __func__, "pasid cache inv")) { + return false; + } + + did = VTD_INV_DESC_PASIDC_DID(inv_desc); + pasid = VTD_INV_DESC_PASIDC_PASID(inv_desc); + pc_info.type = VTD_INV_DESC_PASIDC_G(inv_desc); + + switch (pc_info.type) { + case VTD_INV_DESC_PASIDC_G_DSI: + trace_vtd_inv_desc_pasid_cache_dsi(did); + pc_info.did = did; + break; + + case VTD_INV_DESC_PASIDC_G_PASID_SI: + /* PASID selective implies a DID selective */ + trace_vtd_inv_desc_pasid_cache_psi(did, pasid); + pc_info.did = did; + pc_info.pasid = pasid ?: PCI_NO_PASID; + break; + + case VTD_INV_DESC_PASIDC_G_GLOBAL: + trace_vtd_inv_desc_pasid_cache_gsi(); + break; + + default: + error_report_once("invalid granularity field in 
PASID-cache invalidate " + "descriptor, hi: 0x%"PRIx64" lo: 0x%" PRIx64, + inv_desc->val[1], inv_desc->val[0]); + return false; + } + + vtd_pasid_cache_sync(s, &pc_info); + return true; +} + static bool vtd_process_inv_iec_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) { @@ -3266,6 +3390,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) } break; + case VTD_INV_DESC_PC: + trace_vtd_inv_desc("pasid-cache", inv_desc.val[1], inv_desc.val[0]); + if (!vtd_process_pasid_desc(s, &inv_desc)) { + return false; + } + break; + case VTD_INV_DESC_PIOTLB: trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]); if (!vtd_process_piotlb_desc(s, &inv_desc)) { @@ -3308,16 +3439,6 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s) } break; - /* - * TODO: the entity of below two cases will be implemented in future series. - * To make guest (which integrates scalable mode support patch set in - * iommu driver) work, just return true is enough so far. - */ - case VTD_INV_DESC_PC: - if (s->scalable_mode) { - break; - } - /* fallthrough */ default: error_report_once("%s: invalid inv desc: hi=%"PRIx64", lo=%"PRIx64 " (unknown type)", __func__, inv_desc.hi, diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 0f6a1237e4..75bafdf0cd 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -316,6 +316,8 @@ typedef enum VTDFaultReason { * request while disabled */ VTD_FR_IR_SID_ERR = 0x26, /* Invalid Source-ID */ + VTD_FR_RTADDR_INV_TTM = 0x31, /* Invalid TTM in RTADDR */ + VTD_FR_SM_PRE_ABS = 0x47, /* SCT.8 : PRE bit in a present SM CE is 0 */ /* PASID directory entry access failure */ @@ -517,6 +519,15 @@ typedef union VTDPRDesc VTDPRDesc; #define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000f1c0ULL #define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL +/* PASID-cache Invalidate Descriptor (pc_inv_dsc) fields */ +#define VTD_INV_DESC_PASIDC_G(x) extract64((x)->val[0], 4, 2) +#define VTD_INV_DESC_PASIDC_G_DSI 0 +#define 
VTD_INV_DESC_PASIDC_G_PASID_SI 1 +#define VTD_INV_DESC_PASIDC_G_GLOBAL 3 +#define VTD_INV_DESC_PASIDC_DID(x) extract64((x)->val[0], 16, 16) +#define VTD_INV_DESC_PASIDC_PASID(x) extract64((x)->val[0], 32, 20) +#define VTD_INV_DESC_PASIDC_RSVD_VAL0 0xfff000000000f1c0ULL + /* Page Request Descriptor */ /* For the low 64-bit of 128-bit */ #define VTD_PRD_TYPE (1ULL) @@ -603,6 +614,12 @@ typedef struct VTDRootEntry VTDRootEntry; #define VTD_SM_CONTEXT_ENTRY_RSVD_VAL1 0xffffffffffe00000ULL #define VTD_SM_CONTEXT_ENTRY_PRE 0x10ULL +typedef struct VTDPASIDCacheInfo { + uint8_t type; + uint16_t did; + uint32_t pasid; +} VTDPASIDCacheInfo; + /* PASID Table Related Definitions */ #define VTD_PASID_DIR_BASE_ADDR_MASK (~0xfffULL) #define VTD_PASID_TABLE_BASE_ADDR_MASK (~0xfffULL) diff --git a/hw/i386/trace-events b/hw/i386/trace-events index ac9e1a10aa..298addb24d 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -24,6 +24,9 @@ vtd_inv_qi_head(uint16_t head) "read head %d" vtd_inv_qi_tail(uint16_t head) "write tail %d" vtd_inv_qi_fetch(void) "" vtd_context_cache_reset(void) "" +vtd_inv_desc_pasid_cache_gsi(void) "" +vtd_inv_desc_pasid_cache_dsi(uint16_t domain) "Domain selective PC invalidation domain 0x%"PRIx16 +vtd_inv_desc_pasid_cache_psi(uint16_t domain, uint32_t pasid) "PASID selective PC invalidation domain 0x%"PRIx16" pasid 0x%"PRIx32 vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present" vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" devfn %"PRIu8" not present" vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" domain 0x%"PRIx16 diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index b2f1ef9595..ca7f7bb661 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -95,6 +95,11 @@ struct VTDPASIDEntry { uint64_t val[8]; }; +typedef struct VTDPASIDCacheEntry { + struct 
VTDPASIDEntry pasid_entry; + bool valid; +} VTDPASIDCacheEntry; + struct VTDAddressSpace { PCIBus *bus; uint8_t devfn; @@ -107,6 +112,7 @@ struct VTDAddressSpace { MemoryRegion iommu_ir_fault; /* Interrupt region for catching fault */ IntelIOMMUState *iommu_state; VTDContextCacheEntry context_cache_entry; + VTDPASIDCacheEntry pasid_cache_entry; QLIST_ENTRY(VTDAddressSpace) next; /* Superset of notifier flags that this address space has */ IOMMUNotifierFlag notifier_flags; From baff72ca2a67974cb19499e7f8b7d87675adc746 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 17 Oct 2025 05:36:01 -0400 Subject: [PATCH 06/14] intel_iommu: Reset pasid cache when system level reset Reset pasid cache when system level reset. Currently we don't have any device supporting PASID yet. So all are PASID_0, its vtd_as is allocated by PCI system and never removed, just mark pasid cache invalid. Signed-off-by: Yi Liu Signed-off-by: Zhenzhong Duan Reviewed-by: Yi Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251017093602.525338-3-zhenzhong.duan@intel.com> --- hw/i386/intel_iommu.c | 15 +++++++++++++++ hw/i386/trace-events | 1 + 2 files changed, 16 insertions(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index c47f13b659..cf0b62f29e 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -87,6 +87,20 @@ struct vtd_iotlb_key { static void vtd_address_space_refresh_all(IntelIOMMUState *s); static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); +static void vtd_pasid_cache_reset_locked(IntelIOMMUState *s) +{ + VTDAddressSpace *vtd_as; + GHashTableIter as_it; + + trace_vtd_pasid_cache_reset(); + + g_hash_table_iter_init(&as_it, s->vtd_address_spaces); + while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_as)) { + VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry; + pc_entry->valid = false; + } +} + static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, uint64_t wmask, uint64_t w1cmask) { @@ -381,6 +395,7 @@ static void vtd_reset_caches(IntelIOMMUState *s) vtd_iommu_lock(s); vtd_reset_iotlb_locked(s); vtd_reset_context_cache_locked(s); + vtd_pasid_cache_reset_locked(s); vtd_iommu_unlock(s); } diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 298addb24d..b704f4f90c 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -24,6 +24,7 @@ vtd_inv_qi_head(uint16_t head) "read head %d" vtd_inv_qi_tail(uint16_t head) "write tail %d" vtd_inv_qi_fetch(void) "" vtd_context_cache_reset(void) "" +vtd_pasid_cache_reset(void) "" vtd_inv_desc_pasid_cache_gsi(void) "" vtd_inv_desc_pasid_cache_dsi(uint16_t domain) "Domain selective PC invalidation domain 0x%"PRIx16 vtd_inv_desc_pasid_cache_psi(uint16_t domain, uint32_t pasid) "PASID selective PC invalidation domain 0x%"PRIx16" pasid 0x%"PRIx32 From c9cf33075a4c8eb299e0ac3bb67ad96490499ced Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 17 Oct 2025 05:36:02 -0400 Subject: [PATCH 07/14] intel_iommu: Fix DMA 
failure when guest switches IOMMU domain Kernel allows user to switch IOMMU domain, e.g., switch between DMA and identity domain. When this happen in IOMMU scalable mode, a pasid cache invalidation request is sent, this request is ignored by vIOMMU which leads to device binding to wrong address space, then DMA fails. This issue exists in scalable mode with both first stage and second stage translations, both emulated and passthrough devices. Take network device for example, below sequence trigger issue: 1. start a guest with iommu=pt 2. echo 0000:01:00.0 > /sys/bus/pci/drivers/virtio-pci/unbind 3. echo DMA > /sys/kernel/iommu_groups/6/type 4. echo 0000:01:00.0 > /sys/bus/pci/drivers/virtio-pci/bind 5. Ping test Fix it by switching address space in invalidation handler. Fixes: 4a4f219e8a10 ("intel_iommu: add scalable-mode option to make scalable mode work") Signed-off-by: Zhenzhong Duan Reviewed-by: Yi Liu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Message-Id: <20251017093602.525338-4-zhenzhong.duan@intel.com> --- hw/i386/intel_iommu.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index cf0b62f29e..78b142ccea 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -3086,6 +3086,11 @@ static inline int vtd_dev_get_pe_from_pasid(VTDAddressSpace *vtd_as, return vtd_ce_get_rid2pasid_entry(s, &ce, pe, vtd_as->pasid); } +static int vtd_pasid_entry_compare(VTDPASIDEntry *p1, VTDPASIDEntry *p2) +{ + return memcmp(p1, p2, sizeof(*p1)); +} + /* Update or invalidate pasid cache based on the pasid entry in guest memory. 
*/ static void vtd_pasid_cache_sync_locked(gpointer key, gpointer value, gpointer user_data) @@ -3094,15 +3099,28 @@ static void vtd_pasid_cache_sync_locked(gpointer key, gpointer value, VTDAddressSpace *vtd_as = value; VTDPASIDCacheEntry *pc_entry = &vtd_as->pasid_cache_entry; VTDPASIDEntry pe; + IOMMUNotifier *n; uint16_t did; if (vtd_dev_get_pe_from_pasid(vtd_as, &pe)) { + if (!pc_entry->valid) { + return; + } /* * No valid pasid entry in guest memory. e.g. pasid entry was modified * to be either all-zero or non-present. Either case means existing * pasid cache should be invalidated. */ pc_entry->valid = false; + + /* + * When a pasid entry isn't valid any more, we should unmap all + * mappings in shadow pages instantly to ensure DMA security. + */ + IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { + vtd_address_space_unmap(vtd_as, n); + } + vtd_switch_address_space(vtd_as); return; } @@ -3128,8 +3146,15 @@ static void vtd_pasid_cache_sync_locked(gpointer key, gpointer value, } } - pc_entry->pasid_entry = pe; - pc_entry->valid = true; + if (!pc_entry->valid) { + pc_entry->pasid_entry = pe; + pc_entry->valid = true; + } else if (!vtd_pasid_entry_compare(&pe, &pc_entry->pasid_entry)) { + return; + } + + vtd_switch_address_space(vtd_as); + vtd_address_space_sync(vtd_as); } static void vtd_pasid_cache_sync(IntelIOMMUState *s, VTDPASIDCacheInfo *pc_info) From 1ba9a5220325dd5260a0c37b6299ce38364a5120 Mon Sep 17 00:00:00 2001 From: German Maglione Date: Wed, 22 Oct 2025 18:24:05 +0200 Subject: [PATCH 08/14] vhost-user: make vhost_set_vring_file() synchronous MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QEMU sends all of VHOST_USER_SET_VRING_KICK, _CALL, and _ERR without setting the NEED_REPLY flag, i.e. by the time the respective vhost_user_set_vring_*() function returns, it is completely up to chance whether the back-end has already processed the request and switched over to the new FD for interrupts. 
At least for vhost_user_set_vring_call(), that is a problem: It is called through vhost_virtqueue_mask(), which is generally used in the VirtioDeviceClass.guest_notifier_mask() implementation, which is in turn called by virtio_pci_one_vector_unmask(). The fact that we do not wait for the back-end to install the FD leads to a race there: Masking interrupts is implemented by redirecting interrupts to an internal event FD that is not connected to the guest. Unmasking then re-installs the guest-connected IRQ FD, then checks if there are pending interrupts left on the masked event FD, and if so, issues an interrupt to the guest. Because guest_notifier_mask() (through vhost_user_set_vring_call()) doesn't wait for the back-end to switch over to the actual IRQ FD, it's possible we check for pending interrupts while the back-end is still using the masked event FD, and then we will lose interrupts that occur before the back-end finally does switch over. Fix this by setting NEED_REPLY on those VHOST_USER_SET_VRING_* messages, so when we get that reply, we know that the back-end is now using the new FD. We have a few reports of a virtiofs mount hanging: - https://gitlab.com/virtio-fs/virtiofsd/-/issues/101 - https://gitlab.com/virtio-fs/virtiofsd/-/issues/133 - https://gitlab.com/virtio-fs/virtiofsd/-/issues/213 This is quite a difficult bug to reproduce, even for the reporters. It only happens in production, every few weeks, and/or on 1 in 300 VMs. So, we are not 100% sure this fixes that issue. However, we think this is still a bug, and at least we have one report that claims this fixed the issue: https://gitlab.com/virtio-fs/virtiofsd/-/issues/133#note_2743209419 Fixes: 5f6f6664bf24 ("Add vhost-user as a vhost backend.") Signed-off-by: German Maglione Signed-off-by: Hanna Czenczek Reviewed-by: Eugenio Pérez Reviewed-by: Stefano Garzarella Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251022162405.318672-1-gmaglione@redhat.com> --- hw/virtio/vhost-user.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 4b0fae12ae..63fa9a1b4b 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1327,8 +1327,11 @@ static int vhost_set_vring_file(struct vhost_dev *dev, VhostUserRequest request, struct vhost_vring_file *file) { + int ret; int fds[VHOST_USER_MAX_RAM_SLOTS]; size_t fd_num = 0; + bool reply_supported = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_REPLY_ACK); VhostUserMsg msg = { .hdr.request = request, .hdr.flags = VHOST_USER_VERSION, @@ -1336,13 +1339,32 @@ static int vhost_set_vring_file(struct vhost_dev *dev, .hdr.size = sizeof(msg.payload.u64), }; + if (reply_supported) { + msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK; + } + if (file->fd > 0) { fds[fd_num++] = file->fd; } else { msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK; } - return vhost_user_write(dev, &msg, fds, fd_num); + ret = vhost_user_write(dev, &msg, fds, fd_num); + if (ret < 0) { + return ret; + } + + if (reply_supported) { + /* + * wait for the back-end's confirmation that the new FD is active, + * otherwise guest_notifier_mask() could check for pending interrupts + * while the back-end is still using the masked event FD, losing + * interrupts that occur before the back-end installs the FD + */ + return process_message_reply(dev, &msg); + } + + return 0; } static int vhost_user_set_vring_kick(struct vhost_dev *dev, From 3e6ad83f209ea8cf497e5d160e44c1bc5e518f1a Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 22 Oct 2025 09:06:37 +0100 Subject: [PATCH 09/14] tests/qtest/bios-tables-test: Prepare for _DSM change in the DSDT table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Subsequent patch will fix the GPEX _DSM method. 
Add the affected DSDT blobs to allowed-diff list for bios-table tests. Signed-off-by: Shameer Kolothum Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Message-Id: <20251022080639.243965-2-skolothumtho@nvidia.com> --- tests/qtest/bios-tables-test-allowed-diff.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..e2fce2e972 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,17 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/aarch64/virt/DSDT", +"tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt", +"tests/data/acpi/aarch64/virt/DSDT.memhp", +"tests/data/acpi/aarch64/virt/DSDT.pxb", +"tests/data/acpi/aarch64/virt/DSDT.topology", +"tests/data/acpi/aarch64/virt/DSDT.acpipcihp", +"tests/data/acpi/aarch64/virt/DSDT.hpoffacpiindex", +"tests/data/acpi/aarch64/virt/DSDT.viot", +"tests/data/acpi/aarch64/virt/DSDT.smmuv3-legacy", +"tests/data/acpi/aarch64/virt/DSDT.smmuv3-dev", +"tests/data/acpi/riscv64/virt/DSDT", +"tests/data/acpi/loongarch64/virt/DSDT", +"tests/data/acpi/loongarch64/virt/DSDT.topology", +"tests/data/acpi/loongarch64/virt/DSDT.numamem", +"tests/data/acpi/loongarch64/virt/DSDT.memhp", +"tests/data/acpi/x86/microvm/DSDT.pcie", From 325aa2d86a20786c308b0874d15a60d1b924bd0e Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Wed, 22 Oct 2025 09:06:38 +0100 Subject: [PATCH 10/14] hw/pci-host/gpex-acpi: Fix _DSM function 0 support return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, only function 0 is supported. According to the ACPI Specification, Revision 6.6, Section 9.1.1 “_DSM (Device Specific Method)”, bit 0 should be 0 to indicate that no other functions are supported beyond function 0. 
The resulting AML change looks like this: Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method { If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") { If ((Arg2 == Zero)) { Return (Buffer (One) { - 0x01 // . + 0x00 // . }) } } } Fixes: 5b85eabe68f9 ("acpi: add acpi_dsdt_add_gpex") Signed-off-by: Eric Auger Signed-off-by: Shameer Kolothum Tested-by: Zhangfei Gao Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Message-Id: <20251022080639.243965-3-skolothumtho@nvidia.com> --- hw/pci-host/gpex-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index 952a0ace19..4587baeb78 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -64,7 +64,7 @@ static Aml *build_pci_host_bridge_dsm_method(void) UUID = aml_touuid("E5C937D0-3553-4D7A-9117-EA4D19C3434D"); ifctx = aml_if(aml_equal(aml_arg(0), UUID)); ifctx1 = aml_if(aml_equal(aml_arg(2), aml_int(0))); - uint8_t byte_list[1] = {1}; + uint8_t byte_list[1] = {0}; buf = aml_buffer(1, byte_list); aml_append(ifctx1, aml_return(buf)); aml_append(ifctx, ifctx1); From ccf166d89dcf57c9d333f05173851f90e416097a Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 22 Oct 2025 09:06:39 +0100 Subject: [PATCH 11/14] tests/qtest/bios-tables-test: Update DSDT blobs after GPEX _DSM change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the reference DSDT blobs after GPEX _DSM change. This affects the aarch64 'virt', riscv64 "virt", loongarch64 "virt" and the x86 'microvm' machines. 
DSDT diff is the same for all the machines/tests:  /*   * Intel ACPI Component Architecture   * AML/ASL+ Disassembler version 20230628 (64-bit version)   * Copyright (c) 2000 - 2023 Intel Corporation   *   * Disassembling to symbolic ASL+ operators   * - * Disassembly of tests/data/acpi/aarch64/virt/DSDT, Fri Oct 10 11:18:21 2025 + * Disassembly of /tmp/aml-E6V9D3, Fri Oct 10 11:18:21 2025   *   * Original Table Header:   *     Signature        "DSDT"   *     Length           0x000014D9 (5337)   *     Revision         0x02 - *     Checksum         0xA4 + *     Checksum         0xA5   *     OEM ID           "BOCHS "   *     OEM Table ID     "BXPC    "   *     OEM Revision     0x00000001 (1)   *     Compiler ID      "BXPC"   *     Compiler Version 0x00000001 (1)   */  DefinitionBlock ("", "DSDT", 2, "BOCHS ", "BXPC    ", 0x00000001)  {      Scope (\_SB)      {          Device (C000)          {              Name (_HID, "ACPI0007" /* Processor Device */)  // _HID: Hardware ID              Name (_UID, Zero)  // _UID: Unique ID          } @@ -1822,33 +1822,33 @@                  Else                  {                      CDW1 |= 0x04                  }                  Return (Arg3)              }              Method (_DSM, 4, NotSerialized)  // _DSM: Device-Specific Method              {                  If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling Interface */))                  {                      If ((Arg2 == Zero))                      {                          Return (Buffer (One)                          { -                             0x01                                             // . +                             0x00                                             // .                          })                      }                  }                  Return (Buffer (One)                  {                       0x00                                             // .                  
})              }              Device (RES0)              {                  Name (_HID, "PNP0C02" /* PNP Motherboard Resources */)  // _HID: Hardware ID                  Name (_CRS, ResourceTemplate ()  // _CRS: Current Resource Settings                  {                      QWordMemory (ResourceProducer, PosDecode, MinFixed, MaxFixed, NonCacheable, ReadWrite, Signed-off-by: Shameer Kolothum Reviewed-by: Eric Auger Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Message-Id: <20251022080639.243965-4-skolothumtho@nvidia.com> --- tests/data/acpi/aarch64/virt/DSDT | Bin 5337 -> 5337 bytes .../data/acpi/aarch64/virt/DSDT.acpihmatvirt | Bin 5423 -> 5423 bytes tests/data/acpi/aarch64/virt/DSDT.acpipcihp | Bin 6246 -> 6246 bytes .../acpi/aarch64/virt/DSDT.hpoffacpiindex | Bin 5391 -> 5391 bytes tests/data/acpi/aarch64/virt/DSDT.memhp | Bin 6698 -> 6698 bytes tests/data/acpi/aarch64/virt/DSDT.pxb | Bin 7812 -> 7812 bytes tests/data/acpi/aarch64/virt/DSDT.smmuv3-dev | Bin 10274 -> 10274 bytes .../data/acpi/aarch64/virt/DSDT.smmuv3-legacy | Bin 10274 -> 10274 bytes tests/data/acpi/aarch64/virt/DSDT.topology | Bin 5539 -> 5539 bytes tests/data/acpi/aarch64/virt/DSDT.viot | Bin 5354 -> 5354 bytes tests/data/acpi/loongarch64/virt/DSDT | Bin 4603 -> 4603 bytes tests/data/acpi/loongarch64/virt/DSDT.memhp | Bin 5824 -> 5824 bytes tests/data/acpi/loongarch64/virt/DSDT.numamem | Bin 4609 -> 4609 bytes .../data/acpi/loongarch64/virt/DSDT.topology | Bin 4905 -> 4905 bytes tests/data/acpi/riscv64/virt/DSDT | Bin 3538 -> 3538 bytes tests/data/acpi/x86/microvm/DSDT.pcie | Bin 2985 -> 2985 bytes tests/qtest/bios-tables-test-allowed-diff.h | 16 ---------------- 17 files changed, 16 deletions(-) diff --git a/tests/data/acpi/aarch64/virt/DSDT b/tests/data/acpi/aarch64/virt/DSDT index 38f01adb61e6e4704821cee5e397888bb6b7e46d..35a862e44714d26ded01d40dc147e76cc73a1c84 100644 GIT binary patch delta 26 
icmcbqc~g_iCDeu3O33dtL7iC~z^4Z8`Aeu3O33dtL7iC~z^4`d0AB>ef3t>iv&CbHl*#KgY2MquK delta 26 hcmZ2wvdVGDP{3t>ja&CbHl*#KgS2MquK diff --git a/tests/data/acpi/aarch64/virt/DSDT.pxb b/tests/data/acpi/aarch64/virt/DSDT.pxb index 71c632cedcca63a77a4cdde53d9bc392102687b6..f98dcbfc6b823bce6d5710e8056a4e260fb92a02 100644 GIT binary patch delta 33 pcmZp%ZL#HY33dr-kz-(B65YtPONfzS^HCu#E=GpU7i6|D0sxh>2=D*^ delta 33 pcmZp%ZL#HY33dr-kz-(B65hzQONfzi^HCu#E=I=97i6|D0sxhu2=D*^ diff --git a/tests/data/acpi/aarch64/virt/DSDT.smmuv3-dev b/tests/data/acpi/aarch64/virt/DSDT.smmuv3-dev index e8c2b376df7bddc2392945ea8cbb550b3d3b5e26..6c12a7aaf8a6315bac968a685f5b6673e7248817 100644 GIT binary patch delta 55 zcmZ1!uqc4bCDiv&CbHh83A%X2VVdH delta 26 hcmaE*`AU<^CDja&CbHh83A%R2VVdH diff --git a/tests/data/acpi/loongarch64/virt/DSDT b/tests/data/acpi/loongarch64/virt/DSDT index 55aa34f988d6ef69293e91c5fe45bee0a02bc5f1..09aa903c4e875f541223e36f59b28e101599df20 100644 GIT binary patch delta 26 icmeyZ{9BpJCDVb2A==` delta 26 hcmZovX;k5I33dr#6k=dte72EmG7lr;=6O8HtN>VV2A==` diff --git a/tests/data/acpi/loongarch64/virt/DSDT.topology b/tests/data/acpi/loongarch64/virt/DSDT.topology index b2afebc938ce45d798c8aa5f45a463f1617e257e..7ab23f47cc82dd7bc1975e17893a8cd61039e66d 100644 GIT binary patch delta 26 icmZ3fwo;ADCD Date: Fri, 10 Oct 2025 16:12:57 +0200 Subject: [PATCH 12/14] virtio-net: Advertise UDP tunnel GSO support by default Allow bidirectional aggregated traffic for UDP encapsulated flows. Add the needed compatibility entries to avoid migration issues vs older QEMU instances. Signed-off-by: Paolo Abeni Acked-by: Jason Wang Tested-by: Lei Yang Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Message-Id: <9c500fbcd2cf29afd1826b1ac906f9d5beac3601.1760104079.git.pabeni@redhat.com> --- hw/core/machine.c | 4 ++++ hw/net/virtio-net.c | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 0580550e12..06e0c9a179 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -40,6 +40,10 @@ GlobalProperty hw_compat_10_1[] = { { TYPE_ACPI_GED, "x-has-hest-addr", "false" }, + { TYPE_VIRTIO_NET, "host_tunnel", "off" }, + { TYPE_VIRTIO_NET, "host_tunnel_csum", "off" }, + { TYPE_VIRTIO_NET, "guest_tunnel", "off" }, + { TYPE_VIRTIO_NET, "guest_tunnel_csum", "off" }, }; const size_t hw_compat_10_1_len = G_N_ELEMENTS(hw_compat_10_1); diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 17ed0ef919..3b85560f6f 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -4299,19 +4299,19 @@ static const Property virtio_net_properties[] = { VIRTIO_DEFINE_PROP_FEATURE("host_tunnel", VirtIONet, host_features_ex, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, - false), + true), VIRTIO_DEFINE_PROP_FEATURE("host_tunnel_csum", VirtIONet, host_features_ex, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM, - false), + true), VIRTIO_DEFINE_PROP_FEATURE("guest_tunnel", VirtIONet, host_features_ex, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, - false), + true), VIRTIO_DEFINE_PROP_FEATURE("guest_tunnel_csum", VirtIONet, host_features_ex, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM, - false), + true), }; static void virtio_net_class_init(ObjectClass *klass, const void *data) From fa4136387928749d5a1b4fa3606ae0ce6dce75aa Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Thu, 6 Nov 2025 11:56:40 +0100 Subject: [PATCH 13/14] q35: increase default tseg size With virtual machines becoming larger (more CPUs, more memory) the memory needed by the SMM code in OVMF to manage page tables and vcpu state grows too. Default SMM memory (aka TSEG) size is 16 MB, and this often is not enough. Bump it to 64 MB for new machine types. 
Signed-off-by: Gerd Hoffmann Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Message-Id: <20251106105640.1642109-1-kraxel@redhat.com> --- hw/i386/pc.c | 4 +++- hw/pci-host/q35.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 4d6bcbb846..f8b919cb6c 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -81,7 +81,9 @@ { "qemu64-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, },\ { "athlon-" TYPE_X86_CPU, "model-id", "QEMU Virtual CPU version " v, }, -GlobalProperty pc_compat_10_1[] = {}; +GlobalProperty pc_compat_10_1[] = { + { "mch", "extended-tseg-mbytes", "16" }, +}; const size_t pc_compat_10_1_len = G_N_ELEMENTS(pc_compat_10_1); GlobalProperty pc_compat_10_0[] = { diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index 1951ae440c..a708758d36 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -663,7 +663,7 @@ static void mch_realize(PCIDevice *d, Error **errp) static const Property mch_props[] = { DEFINE_PROP_UINT16("extended-tseg-mbytes", MCHPCIState, ext_tseg_mbytes, - 16), + 64), DEFINE_PROP_BOOL("smbase-smram", MCHPCIState, has_smram_at_smbase, true), }; From 97f24a0496be9e0a7216fea1fa0d54c1db9066e2 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Thu, 6 Nov 2025 20:21:05 +0100 Subject: [PATCH 14/14] vhost-user.rst: clarify when FDs can be sent Previously the spec did not say where in a message the FDs should be sent. As I understand it, FDs transferred in ancillary data will always be received along with the first byte of the data they were sent with, so we should define which byte that is. Going by both libvhost-user in QEMU and the rust-vmm crate, that byte is the first byte of the message header. This is important to specify because it would make back-end implementation significantly more complicated if receiving file descriptors in the middle of a message had to be handled. Signed-off-by: Alyssa Ross Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin Message-Id: <20251106192105.3456755-1-hi@alyssa.is> --- docs/interop/vhost-user.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst index 2e50f2ddfa..93a9c8df2b 100644 --- a/docs/interop/vhost-user.rst +++ b/docs/interop/vhost-user.rst @@ -411,6 +411,13 @@ in the ancillary data: * ``VHOST_USER_SET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``) * ``VHOST_USER_SET_DEVICE_STATE_FD`` +When sending file descriptors in ancillary data, *front-end* should +associate the ancillary data with a ``sendmsg`` operation (or +equivalent) that sends bytes starting with the first byte of the +message header. *back-end* can therefore expect that file descriptors +will only be received in the first ``recvmsg`` operation for a message +header. + If *front-end* is unable to send the full message or receives a wrong reply it will close the connection. An optional reconnection mechanism can be implemented.