From e1e2909f8e74051a34a044940f90d4650b6e784a Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:44 +0100 Subject: [PATCH 01/28] hw/i386/pc_piix.c: restrict isapc machine to 32-bit CPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The isapc machine represents a legacy ISA PC with a 486 CPU. Whilst it is possible to specify any CPU via -cpu on the command line, it makes no sense to allow modern 64-bit CPUs to be used. Restrict the isapc machine to the available 32-bit CPUs, taking care to handle the case where if a user inadvertently uses either -cpu max or -cpu host then the "best" 32-bit CPU is used (in this case the pentium3). Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20250828111057.468712-2-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index d165ac72ed..8f5fb3cf90 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -436,6 +436,31 @@ static void pc_set_south_bridge(Object *obj, int value, Error **errp) #ifdef CONFIG_ISAPC static void pc_init_isa(MachineState *machine) { + /* + * There is a small chance that someone unintentionally passes "-cpu max" + * for the isapc machine, which will provide a much more modern 32-bit + * CPU than would be expected for an ISA-era PC. If the "max" cpu type has + * been specified, choose the "best" 32-bit cpu possible which we consider + * be the pentium3 (deliberately choosing an Intel CPU given that the + * default 486 CPU for the isapc machine is also an Intel CPU). + */ + if (!strcmp(machine->cpu_type, X86_CPU_TYPE_NAME("max"))) { + machine->cpu_type = X86_CPU_TYPE_NAME("pentium3"); + warn_report("-cpu max is invalid for isapc machine, using pentium3"); + } + + /* + * Similarly if someone unintentionally passes "-cpu host" for the isapc + * machine then display a warning and also switch to the "best" 32-bit + * cpu possible which we consider to be the pentium3. This is because any + * host CPU will already be modern than this, but it also ensures any + * newer CPU flags/features are filtered out for older guests. + */ + if (!strcmp(machine->cpu_type, X86_CPU_TYPE_NAME("host"))) { + machine->cpu_type = X86_CPU_TYPE_NAME("pentium3"); + warn_report("-cpu host is invalid for isapc machine, using pentium3"); + } + pc_init1(machine, NULL); } #endif @@ -815,7 +840,20 @@ DEFINE_I440FX_MACHINE(2, 6); #ifdef CONFIG_ISAPC static void isapc_machine_options(MachineClass *m) { + static const char * const valid_cpu_types[] = { + X86_CPU_TYPE_NAME("486"), + X86_CPU_TYPE_NAME("athlon"), + X86_CPU_TYPE_NAME("kvm32"), + X86_CPU_TYPE_NAME("pentium"), + X86_CPU_TYPE_NAME("pentium2"), + X86_CPU_TYPE_NAME("pentium3"), + X86_CPU_TYPE_NAME("qemu32"), + X86_CPU_TYPE_NAME("max"), + X86_CPU_TYPE_NAME("host"), + NULL + }; PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + m->desc = "ISA-only PC"; m->max_cpus = 1; m->option_rom_has_mr = true; @@ -828,6 +866,7 @@ static void isapc_machine_options(MachineClass *m) pcmc->has_reserved_memory = false; m->default_nic = "ne2k_isa"; m->default_cpu_type = X86_CPU_TYPE_NAME("486"); + m->valid_cpu_types = valid_cpu_types; m->no_floppy = !module_object_class_by_name(TYPE_ISA_FDC); m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL); } From 483a232e0431f19a4d6596be59c1d51370407249 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:45 +0100 Subject: [PATCH 02/28] hw/i386/pc_piix.c: restrict isapc machine to 3.5G memory Since the isapc machine is now limited to using 32-bit CPUs, add a hard restriction so that the machine cannot be started with more than 3.5G memory. This matches the default value for max_ram_below_4g if not specified and provides consistent behaviour betweem TCG and KVM accelerators. Signed-off-by: Mark Cave-Ayland Link: https://lore.kernel.org/r/20250828111057.468712-3-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 8f5fb3cf90..9a3b5d88f0 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -461,6 +461,12 @@ static void pc_init_isa(MachineState *machine) warn_report("-cpu host is invalid for isapc machine, using pentium3"); } + if (machine->ram_size > 3.5 * GiB) { + error_report("Too much memory for this machine: %" PRId64 " MiB, " + "maximum 3584 MiB", machine->ram_size / MiB); + exit(1); + } + pc_init1(machine, NULL); } #endif From b55eab382cfbeb11f0afe116a06243d3fe5e43d9 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:46 +0100 Subject: [PATCH 03/28] hw/i386/pc_piix.c: remove include for loader.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This header is not required since the loader functionality is handled separately by pc_memory_init() in pc.c. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-4-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 9a3b5d88f0..351986232d 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -28,7 +28,6 @@ #include "qemu/units.h" #include "hw/char/parallel-isa.h" #include "hw/dma/i8257.h" -#include "hw/loader.h" #include "hw/i386/x86.h" #include "hw/i386/pc.h" #include "hw/i386/apic.h" From 79233a7e600ff26c623aecee81aa9a04cbbc7668 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:47 +0100 Subject: [PATCH 04/28] hw/i386/pc_piix.c: inline pc_xen_hvm_init_pci() into pc_xen_hvm_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helps to simplify the initialisation of the Xen hvm machine. Signed-off-by: Philippe Mathieu-Daudé Signed-off-by: Mark Cave-Ayland Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-5-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 351986232d..8e302dc013 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -471,14 +471,6 @@ static void pc_init_isa(MachineState *machine) #endif #ifdef CONFIG_XEN -static void pc_xen_hvm_init_pci(MachineState *machine) -{ - const char *pci_type = xen_igd_gfx_pt_enabled() ? - TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE : TYPE_I440FX_PCI_DEVICE; - - pc_init1(machine, pci_type); -} - static void pc_xen_hvm_init(MachineState *machine) { PCMachineState *pcms = PC_MACHINE(machine); @@ -488,7 +480,10 @@ static void pc_xen_hvm_init(MachineState *machine) exit(1); } - pc_xen_hvm_init_pci(machine); + pc_init1(machine, xen_igd_gfx_pt_enabled() + ? TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE + : TYPE_I440FX_PCI_DEVICE); + xen_igd_reserve_slot(pcms->pcibus); pci_create_simple(pcms->pcibus, -1, "xen-platform"); } From 469be2f11f7279fe9174199183cf51ba1f557e2d Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:48 +0100 Subject: [PATCH 05/28] hw/i386/pc_piix.c: duplicate pc_init1() into pc_isa_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is to prepare for splitting the isapc machine into its own separate file. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20250828111057.468712-6-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 275 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 274 insertions(+), 1 deletion(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 8e302dc013..60bf18c680 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -435,6 +435,23 @@ static void pc_set_south_bridge(Object *obj, int value, Error **errp) #ifdef CONFIG_ISAPC static void pc_init_isa(MachineState *machine) { + const char *pci_type = NULL; + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(machine); + MemoryRegion *system_memory = get_system_memory(); + MemoryRegion *system_io = get_system_io(); + Object *phb = NULL; + ISABus *isa_bus; + Object *piix4_pm = NULL; + qemu_irq smi_irq; + GSIState *gsi_state; + MemoryRegion *ram_memory; + MemoryRegion *pci_memory = NULL; + MemoryRegion *rom_memory = system_memory; + ram_addr_t lowmem; + uint64_t hole64_size = 0; + /* * There is a small chance that someone unintentionally passes "-cpu max" * for the isapc machine, which will provide a much more modern 32-bit @@ -466,7 +483,263 @@ static void pc_init_isa(MachineState *machine) exit(1); } - pc_init1(machine, NULL); + /* + * Calculate ram split, for memory below and above 4G. It's a bit + * complicated for backward compatibility reasons ... + * + * - Traditional split is 3.5G (lowmem = 0xe0000000). This is the + * default value for max_ram_below_4g now. + * + * - Then, to gigabyte align the memory, we move the split to 3G + * (lowmem = 0xc0000000). But only in case we have to split in + * the first place, i.e. ram_size is larger than (traditional) + * lowmem. And for new machine types (gigabyte_align = true) + * only, for live migration compatibility reasons. + * + * - Next the max-ram-below-4g option was added, which allowed to + * reduce lowmem to a smaller value, to allow a larger PCI I/O + * window below 4G. qemu doesn't enforce gigabyte alignment here, + * but prints a warning. + * + * - Finally max-ram-below-4g got updated to also allow raising lowmem, + * so legacy non-PAE guests can get as much memory as possible in + * the 32bit address space below 4G. + * + * - Note that Xen has its own ram setup code in xen_ram_init(), + * called via xen_hvm_init_pc(). + * + * Examples: + * qemu -M pc-1.7 -m 4G (old default) -> 3584M low, 512M high + * qemu -M pc -m 4G (new default) -> 3072M low, 1024M high + * qemu -M pc,max-ram-below-4g=2G -m 4G -> 2048M low, 2048M high + * qemu -M pc,max-ram-below-4g=4G -m 3968M -> 3968M low (=4G-128M) + */ + if (xen_enabled()) { + xen_hvm_init_pc(pcms, &ram_memory); + } else { + ram_memory = machine->ram; + if (!pcms->max_ram_below_4g) { + pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */ + } + lowmem = pcms->max_ram_below_4g; + if (machine->ram_size >= pcms->max_ram_below_4g) { + if (pcmc->gigabyte_align) { + if (lowmem > 0xc0000000) { + lowmem = 0xc0000000; + } + if (lowmem & (1 * GiB - 1)) { + warn_report("Large machine and max_ram_below_4g " + "(%" PRIu64 ") not a multiple of 1G; " + "possible bad performance.", + pcms->max_ram_below_4g); + } + } + } + + if (machine->ram_size >= lowmem) { + x86ms->above_4g_mem_size = machine->ram_size - lowmem; + x86ms->below_4g_mem_size = lowmem; + } else { + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = machine->ram_size; + } + } + + pc_machine_init_sgx_epc(pcms); + x86_cpus_init(x86ms, pcmc->default_cpu_version); + + if (kvm_enabled()) { + kvmclock_create(pcmc->kvmclock_create_always); + } + + if (pcmc->pci_enabled) { + pci_memory = g_new(MemoryRegion, 1); + memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); + rom_memory = pci_memory; + + phb = OBJECT(qdev_new(TYPE_I440FX_PCI_HOST_BRIDGE)); + object_property_add_child(OBJECT(machine), "i440fx", phb); + object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM, + OBJECT(ram_memory), &error_fatal); + object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM, + OBJECT(pci_memory), &error_fatal); + object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM, + OBJECT(system_memory), &error_fatal); + object_property_set_link(phb, PCI_HOST_PROP_IO_MEM, + OBJECT(system_io), &error_fatal); + object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE, + x86ms->below_4g_mem_size, &error_fatal); + object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE, + x86ms->above_4g_mem_size, &error_fatal); + object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type, + &error_fatal); + sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal); + + pcms->pcibus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0")); + pci_bus_map_irqs(pcms->pcibus, + xen_enabled() ? xen_pci_slot_get_pirq + : pc_pci_slot_get_pirq); + + hole64_size = object_property_get_uint(phb, + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); + } + + /* allocate ram and load rom/bios */ + if (!xen_enabled()) { + pc_memory_init(pcms, system_memory, rom_memory, hole64_size); + } else { + assert(machine->ram_size == x86ms->below_4g_mem_size + + x86ms->above_4g_mem_size); + + pc_system_flash_cleanup_unused(pcms); + if (machine->kernel_filename != NULL) { + /* For xen HVM direct kernel boot, load linux here */ + xen_load_linux(pcms); + } + } + + gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled); + + if (pcmc->pci_enabled) { + PCIDevice *pci_dev; + DeviceState *dev; + size_t i; + + pci_dev = pci_new_multifunction(-1, pcms->south_bridge); + object_property_set_bool(OBJECT(pci_dev), "has-usb", + machine_usb(machine), &error_abort); + object_property_set_bool(OBJECT(pci_dev), "has-acpi", + x86_machine_is_acpi_enabled(x86ms), + &error_abort); + object_property_set_bool(OBJECT(pci_dev), "has-pic", false, + &error_abort); + object_property_set_bool(OBJECT(pci_dev), "has-pit", false, + &error_abort); + qdev_prop_set_uint32(DEVICE(pci_dev), "smb_io_base", 0xb100); + object_property_set_bool(OBJECT(pci_dev), "smm-enabled", + x86_machine_is_smm_enabled(x86ms), + &error_abort); + dev = DEVICE(pci_dev); + for (i = 0; i < ISA_NUM_IRQS; i++) { + qdev_connect_gpio_out_named(dev, "isa-irqs", i, x86ms->gsi[i]); + } + pci_realize_and_unref(pci_dev, pcms->pcibus, &error_fatal); + + if (xen_enabled()) { + pci_device_set_intx_routing_notifier( + pci_dev, piix_intx_routing_notifier_xen); + + /* + * Xen supports additional interrupt routes from the PCI devices to + * the IOAPIC: the four pins of each PCI device on the bus are also + * connected to the IOAPIC directly. + * These additional routes can be discovered through ACPI. + */ + pci_bus_irqs(pcms->pcibus, xen_intx_set_irq, pci_dev, + XEN_IOAPIC_NUM_PIRQS); + } + + isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(pci_dev), "isa.0")); + x86ms->rtc = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev), + "rtc")); + piix4_pm = object_resolve_path_component(OBJECT(pci_dev), "pm"); + dev = DEVICE(object_resolve_path_component(OBJECT(pci_dev), "ide")); + pci_ide_create_devs(PCI_DEVICE(dev)); + pcms->idebus[0] = qdev_get_child_bus(dev, "ide.0"); + pcms->idebus[1] = qdev_get_child_bus(dev, "ide.1"); + } else { + uint32_t irq; + + isa_bus = isa_bus_new(NULL, system_memory, system_io, + &error_abort); + isa_bus_register_input_irqs(isa_bus, x86ms->gsi); + + x86ms->rtc = isa_new(TYPE_MC146818_RTC); + qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000); + isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal); + irq = object_property_get_uint(OBJECT(x86ms->rtc), "irq", + &error_fatal); + isa_connect_gpio_out(ISA_DEVICE(x86ms->rtc), 0, irq); + + i8257_dma_init(OBJECT(machine), isa_bus, 0); + pcms->hpet_enabled = false; + } + + if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) { + pc_i8259_create(isa_bus, gsi_state->i8259_irq); + } + + if (phb) { + ioapic_init_gsi(gsi_state, phb); + } + + if (tcg_enabled()) { + x86_register_ferr_irq(x86ms->gsi[13]); + } + + pc_vga_init(isa_bus, pcmc->pci_enabled ? pcms->pcibus : NULL); + + /* init basic PC hardware */ + pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, + !MACHINE_CLASS(pcmc)->no_floppy, 0x4); + + pc_nic_init(pcmc, isa_bus, pcms->pcibus); + +#ifdef CONFIG_IDE_ISA + if (!pcmc->pci_enabled) { + DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; + int i; + + ide_drive_get(hd, ARRAY_SIZE(hd)); + for (i = 0; i < MAX_IDE_BUS; i++) { + ISADevice *dev; + char busname[] = "ide.0"; + dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], + ide_irq[i], + hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]); + /* + * The ide bus name is ide.0 for the first bus and ide.1 for the + * second one. + */ + busname[4] = '0' + i; + pcms->idebus[i] = qdev_get_child_bus(DEVICE(dev), busname); + } + } +#endif + + if (piix4_pm) { + smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0); + + qdev_connect_gpio_out_named(DEVICE(piix4_pm), "smi-irq", 0, smi_irq); + pcms->smbus = I2C_BUS(qdev_get_child_bus(DEVICE(piix4_pm), "i2c")); + /* TODO: Populate SPD eeprom data. */ + smbus_eeprom_init(pcms->smbus, 8, NULL, 0); + + object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + TYPE_HOTPLUG_HANDLER, + (Object **)&x86ms->acpi_dev, + object_property_allow_set_link, + OBJ_PROP_LINK_STRONG); + object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + piix4_pm, &error_abort); + } + + if (machine->nvdimms_state->is_enabled) { + nvdimm_init_acpi_state(machine->nvdimms_state, system_io, + x86_nvdimm_acpi_dsmio, + x86ms->fw_cfg, OBJECT(pcms)); + } + +#if defined(CONFIG_IGVM) + /* Apply guest state from IGVM if supplied */ + if (x86ms->igvm) { + if (IGVM_CFG_GET_CLASS(x86ms->igvm) + ->process(x86ms->igvm, machine->cgs, false, &error_fatal) < 0) { + g_assert_not_reached(); + } + } +#endif } #endif From ba5500e0385fad2dd1d4878872695023e5e32e92 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:49 +0100 Subject: [PATCH 06/28] hw/i386/pc_piix.c: remove pcmc->pci_enabled dependent initialisation from pc_init_isa() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI code will never be used for an isapc machine. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-7-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 120 ++++++---------------------------------------- 1 file changed, 15 insertions(+), 105 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 60bf18c680..f1b4468d0a 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -435,19 +435,17 @@ static void pc_set_south_bridge(Object *obj, int value, Error **errp) #ifdef CONFIG_ISAPC static void pc_init_isa(MachineState *machine) { - const char *pci_type = NULL; PCMachineState *pcms = PC_MACHINE(machine); PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); X86MachineState *x86ms = X86_MACHINE(machine); MemoryRegion *system_memory = get_system_memory(); MemoryRegion *system_io = get_system_io(); - Object *phb = NULL; ISABus *isa_bus; Object *piix4_pm = NULL; qemu_irq smi_irq; + uint32_t irq; GSIState *gsi_state; MemoryRegion *ram_memory; - MemoryRegion *pci_memory = NULL; MemoryRegion *rom_memory = system_memory; ram_addr_t lowmem; uint64_t hole64_size = 0; @@ -552,39 +550,6 @@ static void pc_init_isa(MachineState *machine) kvmclock_create(pcmc->kvmclock_create_always); } - if (pcmc->pci_enabled) { - pci_memory = g_new(MemoryRegion, 1); - memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); - rom_memory = pci_memory; - - phb = OBJECT(qdev_new(TYPE_I440FX_PCI_HOST_BRIDGE)); - object_property_add_child(OBJECT(machine), "i440fx", phb); - object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM, - OBJECT(ram_memory), &error_fatal); - object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM, - OBJECT(pci_memory), &error_fatal); - object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM, - OBJECT(system_memory), &error_fatal); - object_property_set_link(phb, PCI_HOST_PROP_IO_MEM, - OBJECT(system_io), &error_fatal); - object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE, - x86ms->below_4g_mem_size, &error_fatal); - object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE, - x86ms->above_4g_mem_size, &error_fatal); - object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type, - &error_fatal); - sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal); - - pcms->pcibus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0")); - pci_bus_map_irqs(pcms->pcibus, - xen_enabled() ? xen_pci_slot_get_pirq - : pc_pci_slot_get_pirq); - - hole64_size = object_property_get_uint(phb, - PCI_HOST_PROP_PCI_HOLE64_SIZE, - &error_abort); - } - /* allocate ram and load rom/bios */ if (!xen_enabled()) { pc_memory_init(pcms, system_memory, rom_memory, hole64_size); @@ -599,92 +564,37 @@ static void pc_init_isa(MachineState *machine) } } - gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled); + gsi_state = pc_gsi_create(&x86ms->gsi, false); - if (pcmc->pci_enabled) { - PCIDevice *pci_dev; - DeviceState *dev; - size_t i; + isa_bus = isa_bus_new(NULL, system_memory, system_io, + &error_abort); + isa_bus_register_input_irqs(isa_bus, x86ms->gsi); - pci_dev = pci_new_multifunction(-1, pcms->south_bridge); - object_property_set_bool(OBJECT(pci_dev), "has-usb", - machine_usb(machine), &error_abort); - object_property_set_bool(OBJECT(pci_dev), "has-acpi", - x86_machine_is_acpi_enabled(x86ms), - &error_abort); - object_property_set_bool(OBJECT(pci_dev), "has-pic", false, - &error_abort); - object_property_set_bool(OBJECT(pci_dev), "has-pit", false, - &error_abort); - qdev_prop_set_uint32(DEVICE(pci_dev), "smb_io_base", 0xb100); - object_property_set_bool(OBJECT(pci_dev), "smm-enabled", - x86_machine_is_smm_enabled(x86ms), - &error_abort); - dev = DEVICE(pci_dev); - for (i = 0; i < ISA_NUM_IRQS; i++) { - qdev_connect_gpio_out_named(dev, "isa-irqs", i, x86ms->gsi[i]); - } - pci_realize_and_unref(pci_dev, pcms->pcibus, &error_fatal); + x86ms->rtc = isa_new(TYPE_MC146818_RTC); + qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000); + isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal); + irq = object_property_get_uint(OBJECT(x86ms->rtc), "irq", + &error_fatal); + isa_connect_gpio_out(ISA_DEVICE(x86ms->rtc), 0, irq); - if (xen_enabled()) { - pci_device_set_intx_routing_notifier( - pci_dev, piix_intx_routing_notifier_xen); - - /* - * Xen supports additional interrupt routes from the PCI devices to - * the IOAPIC: the four pins of each PCI device on the bus are also - * connected to the IOAPIC directly. - * These additional routes can be discovered through ACPI. - */ - pci_bus_irqs(pcms->pcibus, xen_intx_set_irq, pci_dev, - XEN_IOAPIC_NUM_PIRQS); - } - - isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(pci_dev), "isa.0")); - x86ms->rtc = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev), - "rtc")); - piix4_pm = object_resolve_path_component(OBJECT(pci_dev), "pm"); - dev = DEVICE(object_resolve_path_component(OBJECT(pci_dev), "ide")); - pci_ide_create_devs(PCI_DEVICE(dev)); - pcms->idebus[0] = qdev_get_child_bus(dev, "ide.0"); - pcms->idebus[1] = qdev_get_child_bus(dev, "ide.1"); - } else { - uint32_t irq; - - isa_bus = isa_bus_new(NULL, system_memory, system_io, - &error_abort); - isa_bus_register_input_irqs(isa_bus, x86ms->gsi); - - x86ms->rtc = isa_new(TYPE_MC146818_RTC); - qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000); - isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal); - irq = object_property_get_uint(OBJECT(x86ms->rtc), "irq", - &error_fatal); - isa_connect_gpio_out(ISA_DEVICE(x86ms->rtc), 0, irq); - - i8257_dma_init(OBJECT(machine), isa_bus, 0); - pcms->hpet_enabled = false; - } + i8257_dma_init(OBJECT(machine), isa_bus, 0); + pcms->hpet_enabled = false; if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) { pc_i8259_create(isa_bus, gsi_state->i8259_irq); } - if (phb) { - ioapic_init_gsi(gsi_state, phb); - } - if (tcg_enabled()) { x86_register_ferr_irq(x86ms->gsi[13]); } - pc_vga_init(isa_bus, pcmc->pci_enabled ? pcms->pcibus : NULL); + pc_vga_init(isa_bus, NULL); /* init basic PC hardware */ pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, !MACHINE_CLASS(pcmc)->no_floppy, 0x4); - pc_nic_init(pcmc, isa_bus, pcms->pcibus); + pc_nic_init(pcmc, isa_bus, NULL); #ifdef CONFIG_IDE_ISA if (!pcmc->pci_enabled) { From dc58530f0a58fe09862026ab6c26c68c00f4d535 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:50 +0100 Subject: [PATCH 07/28] hw/i386/pc_piix.c: remove igvm initialisation from pc_init_isa() According to the QEMU documentation igvm is only supported for the pc and q35 machines so remove igvm support from the isapc machine. Signed-off-by: Mark Cave-Ayland Link: https://lore.kernel.org/r/20250828111057.468712-8-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index f1b4468d0a..5ae265bd53 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -640,16 +640,6 @@ static void pc_init_isa(MachineState *machine) x86_nvdimm_acpi_dsmio, x86ms->fw_cfg, OBJECT(pcms)); } - -#if defined(CONFIG_IGVM) - /* Apply guest state from IGVM if supplied */ - if (x86ms->igvm) { - if (IGVM_CFG_GET_CLASS(x86ms->igvm) - ->process(x86ms->igvm, machine->cgs, false, &error_fatal) < 0) { - g_assert_not_reached(); - } - } -#endif } #endif From ae4199af92d38fffd0d1fb9f02eaeb0632eff6df Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:51 +0100 Subject: [PATCH 08/28] hw/i386/pc_piix.c: remove SMI and piix4_pm initialisation from pc_init_isa() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These are based upon the PIIX4 PCI chipset and so can never be used on an isapc machine. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-9-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 5ae265bd53..57b02da5a8 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -441,8 +441,6 @@ static void pc_init_isa(MachineState *machine) MemoryRegion *system_memory = get_system_memory(); MemoryRegion *system_io = get_system_io(); ISABus *isa_bus; - Object *piix4_pm = NULL; - qemu_irq smi_irq; uint32_t irq; GSIState *gsi_state; MemoryRegion *ram_memory; @@ -618,23 +616,6 @@ static void pc_init_isa(MachineState *machine) } #endif - if (piix4_pm) { - smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0); - - qdev_connect_gpio_out_named(DEVICE(piix4_pm), "smi-irq", 0, smi_irq); - pcms->smbus = I2C_BUS(qdev_get_child_bus(DEVICE(piix4_pm), "i2c")); - /* TODO: Populate SPD eeprom data. */ - smbus_eeprom_init(pcms->smbus, 8, NULL, 0); - - object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, - TYPE_HOTPLUG_HANDLER, - (Object **)&x86ms->acpi_dev, - object_property_allow_set_link, - OBJ_PROP_LINK_STRONG); - object_property_set_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, - piix4_pm, &error_abort); - } - if (machine->nvdimms_state->is_enabled) { nvdimm_init_acpi_state(machine->nvdimms_state, system_io, x86_nvdimm_acpi_dsmio, From d7916f6d5ec346d05ef63f5419d97a4d9f7d0a75 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:52 +0100 Subject: [PATCH 09/28] hw/i386/pc_piix.c: remove SGX initialisation from pc_init_isa() The Intel SGX instructions only exist on recent CPUs and so would never be available on a CPU from the pre-PCI era. Signed-off-by: Mark Cave-Ayland Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-10-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 57b02da5a8..9a2eee8ab0 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -541,7 +541,6 @@ static void pc_init_isa(MachineState *machine) } } - pc_machine_init_sgx_epc(pcms); x86_cpus_init(x86ms, pcmc->default_cpu_version); if (kvm_enabled()) { From 62f8d562bb86cd7e1a0ce06f191c402a1eeba309 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:53 +0100 Subject: [PATCH 10/28] hw/i386/pc_piix.c: remove nvdimm initialisation from pc_init_isa() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NVDIMMs cannot be used by PCs from a pre-PCI era. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-11-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 9a2eee8ab0..daf63a326b 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -614,12 +614,6 @@ static void pc_init_isa(MachineState *machine) } } #endif - - if (machine->nvdimms_state->is_enabled) { - nvdimm_init_acpi_state(machine->nvdimms_state, system_io, - x86_nvdimm_acpi_dsmio, - x86ms->fw_cfg, OBJECT(pcms)); - } } #endif From f2096fa151cbdf6cd169a6f0be9c5ccb5cd10466 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:54 +0100 Subject: [PATCH 11/28] hw/i386/pc_piix.c: simplify RAM size logic in pc_init_isa() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All isapc machines must have 32-bit CPUs and so the RAM split logic can be hardcoded accordingly. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20250828111057.468712-12-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 58 ++++------------------------------------------- 1 file changed, 4 insertions(+), 54 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index daf63a326b..0bc033943c 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -445,7 +445,6 @@ static void pc_init_isa(MachineState *machine) GSIState *gsi_state; MemoryRegion *ram_memory; MemoryRegion *rom_memory = system_memory; - ram_addr_t lowmem; uint64_t hole64_size = 0; /* @@ -480,65 +479,16 @@ static void pc_init_isa(MachineState *machine) } /* - * Calculate ram split, for memory below and above 4G. It's a bit - * complicated for backward compatibility reasons ... - * - * - Traditional split is 3.5G (lowmem = 0xe0000000). This is the - * default value for max_ram_below_4g now. - * - * - Then, to gigabyte align the memory, we move the split to 3G - * (lowmem = 0xc0000000). But only in case we have to split in - * the first place, i.e. ram_size is larger than (traditional) - * lowmem. And for new machine types (gigabyte_align = true) - * only, for live migration compatibility reasons. - * - * - Next the max-ram-below-4g option was added, which allowed to - * reduce lowmem to a smaller value, to allow a larger PCI I/O - * window below 4G. qemu doesn't enforce gigabyte alignment here, - * but prints a warning. - * - * - Finally max-ram-below-4g got updated to also allow raising lowmem, - * so legacy non-PAE guests can get as much memory as possible in - * the 32bit address space below 4G. - * - * - Note that Xen has its own ram setup code in xen_ram_init(), - * called via xen_hvm_init_pc(). - * - * Examples: - * qemu -M pc-1.7 -m 4G (old default) -> 3584M low, 512M high - * qemu -M pc -m 4G (new default) -> 3072M low, 1024M high - * qemu -M pc,max-ram-below-4g=2G -m 4G -> 2048M low, 2048M high - * qemu -M pc,max-ram-below-4g=4G -m 3968M -> 3968M low (=4G-128M) + * There is no RAM split for the isapc machine */ if (xen_enabled()) { xen_hvm_init_pc(pcms, &ram_memory); } else { ram_memory = machine->ram; - if (!pcms->max_ram_below_4g) { - pcms->max_ram_below_4g = 0xe0000000; /* default: 3.5G */ - } - lowmem = pcms->max_ram_below_4g; - if (machine->ram_size >= pcms->max_ram_below_4g) { - if (pcmc->gigabyte_align) { - if (lowmem > 0xc0000000) { - lowmem = 0xc0000000; - } - if (lowmem & (1 * GiB - 1)) { - warn_report("Large machine and max_ram_below_4g " - "(%" PRIu64 ") not a multiple of 1G; " - "possible bad performance.", - pcms->max_ram_below_4g); - } - } - } - if (machine->ram_size >= lowmem) { - x86ms->above_4g_mem_size = machine->ram_size - lowmem; - x86ms->below_4g_mem_size = lowmem; - } else { - x86ms->above_4g_mem_size = 0; - x86ms->below_4g_mem_size = machine->ram_size; - } + pcms->max_ram_below_4g = 3.5 * GiB; + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = machine->ram_size; } x86_cpus_init(x86ms, pcmc->default_cpu_version); From 20fd284ec3c6ec39fb8e062d52f1ee514e89c554 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:55 +0100 Subject: [PATCH 12/28] hw/i386/pc_piix.c: hardcode hole64_size to 0 in pc_init_isa() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All isapc machines must have 32-bit CPUs and have no PCI 64-bit hole so it can be hardcoded to 0. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-13-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 0bc033943c..66dc4a5186 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -445,7 +445,6 @@ static void pc_init_isa(MachineState *machine) GSIState *gsi_state; MemoryRegion *ram_memory; MemoryRegion *rom_memory = system_memory; - uint64_t hole64_size = 0; /* * There is a small chance that someone unintentionally passes "-cpu max" @@ -499,7 +498,7 @@ static void pc_init_isa(MachineState *machine) /* allocate ram and load rom/bios */ if (!xen_enabled()) { - pc_memory_init(pcms, system_memory, rom_memory, hole64_size); + pc_memory_init(pcms, system_memory, rom_memory, 0); } else { assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); From b11ad71e32441baff354cdab1993847b61923570 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:56 +0100 Subject: [PATCH 13/28] hw/i386/pc_piix.c: remove pc_system_flash_cleanup_unused() from pc_init_isa() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function contains 'assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled)' and so we can safely assume that it should never be used for the isapc machine. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20250828111057.468712-14-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 66dc4a5186..fb936748bd 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -503,7 +503,6 @@ static void pc_init_isa(MachineState *machine) assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); - pc_system_flash_cleanup_unused(pcms); if (machine->kernel_filename != NULL) { /* For xen HVM direct kernel boot, load linux here */ xen_load_linux(pcms); From 32c73eb73c7c09290a642c0c3ea541ce00350994 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:57 +0100 Subject: [PATCH 14/28] hw/i386/pc_piix.c: always initialise ISA IDE drives in pc_init_isa() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By definition an isapc machine must always use ISA IDE drives so ensure that they are always enabled. At the same time also remove the surrounding CONFIG_IDE_ISA define since it will be enabled via the ISAPC Kconfig. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-15-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index fb936748bd..72ddd9b149 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -445,6 +445,8 @@ static void pc_init_isa(MachineState *machine) GSIState *gsi_state; MemoryRegion *ram_memory; MemoryRegion *rom_memory = system_memory; + DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; + int i; /* * There is a small chance that someone unintentionally passes "-cpu max" @@ -541,27 +543,20 @@ static void pc_init_isa(MachineState *machine) pc_nic_init(pcmc, isa_bus, NULL); -#ifdef CONFIG_IDE_ISA - if (!pcmc->pci_enabled) { - DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; - int i; - - ide_drive_get(hd, ARRAY_SIZE(hd)); - for (i = 0; i < MAX_IDE_BUS; i++) { - ISADevice *dev; - char busname[] = "ide.0"; - dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], - ide_irq[i], - hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]); - /* - * The ide bus name is ide.0 for the first bus and ide.1 for the - * second one. - */ - busname[4] = '0' + i; - pcms->idebus[i] = qdev_get_child_bus(DEVICE(dev), busname); - } + ide_drive_get(hd, ARRAY_SIZE(hd)); + for (i = 0; i < MAX_IDE_BUS; i++) { + ISADevice *dev; + char busname[] = "ide.0"; + dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], + ide_irq[i], + hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]); + /* + * The ide bus name is ide.0 for the first bus and ide.1 for the + * second one. + */ + busname[4] = '0' + i; + pcms->idebus[i] = qdev_get_child_bus(DEVICE(dev), busname); } -#endif } #endif From 99d0630a454581eeb2bfabdc0bc15cc07d145876 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:58 +0100 Subject: [PATCH 15/28] hw/i386/pc_piix.c: assume pcmc->pci_enabled is always true in pc_init1() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI is always enabled on the pc-i440fx machine so hardcode the relevant logic in pc_init1(). Add an assert() to ensure that this is always the case at runtime as already done in pc_q35_init(). Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-16-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 192 ++++++++++++++++++---------------------------- 1 file changed, 76 insertions(+), 116 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 72ddd9b149..3ea77b2c44 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -125,6 +125,11 @@ static void pc_init1(MachineState *machine, const char *pci_type) MemoryRegion *rom_memory = system_memory; ram_addr_t lowmem; uint64_t hole64_size = 0; + PCIDevice *pci_dev; + DeviceState *dev; + size_t i; + + assert(pcmc->pci_enabled); /* * Calculate ram split, for memory below and above 4G. It's a bit @@ -195,38 +200,36 @@ static void pc_init1(MachineState *machine, const char *pci_type) kvmclock_create(pcmc->kvmclock_create_always); } - if (pcmc->pci_enabled) { - pci_memory = g_new(MemoryRegion, 1); - memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); - rom_memory = pci_memory; + pci_memory = g_new(MemoryRegion, 1); + memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); + rom_memory = pci_memory; - phb = OBJECT(qdev_new(TYPE_I440FX_PCI_HOST_BRIDGE)); - object_property_add_child(OBJECT(machine), "i440fx", phb); - object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM, - OBJECT(ram_memory), &error_fatal); - object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM, - OBJECT(pci_memory), &error_fatal); - object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM, - OBJECT(system_memory), &error_fatal); - object_property_set_link(phb, PCI_HOST_PROP_IO_MEM, - OBJECT(system_io), &error_fatal); - object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE, - x86ms->below_4g_mem_size, &error_fatal); - object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE, - x86ms->above_4g_mem_size, &error_fatal); - object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type, - &error_fatal); - sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal); + phb = OBJECT(qdev_new(TYPE_I440FX_PCI_HOST_BRIDGE)); + object_property_add_child(OBJECT(machine), "i440fx", phb); + object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM, + OBJECT(ram_memory), &error_fatal); + object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM, + OBJECT(pci_memory), &error_fatal); + object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM, + OBJECT(system_memory), &error_fatal); + object_property_set_link(phb, PCI_HOST_PROP_IO_MEM, + OBJECT(system_io), &error_fatal); + object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE, + x86ms->below_4g_mem_size, &error_fatal); + object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE, + x86ms->above_4g_mem_size, &error_fatal); + object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type, + &error_fatal); + sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal); - pcms->pcibus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0")); - pci_bus_map_irqs(pcms->pcibus, - xen_enabled() ? xen_pci_slot_get_pirq - : pc_pci_slot_get_pirq); + pcms->pcibus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0")); + pci_bus_map_irqs(pcms->pcibus, + xen_enabled() ? xen_pci_slot_get_pirq + : pc_pci_slot_get_pirq); - hole64_size = object_property_get_uint(phb, - PCI_HOST_PROP_PCI_HOLE64_SIZE, - &error_abort); - } + hole64_size = object_property_get_uint(phb, + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); /* allocate ram and load rom/bios */ if (!xen_enabled()) { @@ -242,72 +245,51 @@ static void pc_init1(MachineState *machine, const char *pci_type) } } - gsi_state = pc_gsi_create(&x86ms->gsi, pcmc->pci_enabled); + gsi_state = pc_gsi_create(&x86ms->gsi, true); - if (pcmc->pci_enabled) { - PCIDevice *pci_dev; - DeviceState *dev; - size_t i; - - pci_dev = pci_new_multifunction(-1, pcms->south_bridge); - object_property_set_bool(OBJECT(pci_dev), "has-usb", - machine_usb(machine), &error_abort); - object_property_set_bool(OBJECT(pci_dev), "has-acpi", - x86_machine_is_acpi_enabled(x86ms), - &error_abort); - object_property_set_bool(OBJECT(pci_dev), "has-pic", false, - &error_abort); - object_property_set_bool(OBJECT(pci_dev), "has-pit", false, - &error_abort); - qdev_prop_set_uint32(DEVICE(pci_dev), "smb_io_base", 0xb100); - object_property_set_bool(OBJECT(pci_dev), "smm-enabled", - x86_machine_is_smm_enabled(x86ms), - &error_abort); - dev = DEVICE(pci_dev); - for (i = 0; i < ISA_NUM_IRQS; i++) { - qdev_connect_gpio_out_named(dev, "isa-irqs", i, x86ms->gsi[i]); - } - pci_realize_and_unref(pci_dev, pcms->pcibus, &error_fatal); - - if (xen_enabled()) { - pci_device_set_intx_routing_notifier( - pci_dev, piix_intx_routing_notifier_xen); - - /* - * Xen supports additional interrupt routes from the PCI devices to - * the IOAPIC: the four pins of each PCI device on the bus are also - * connected to the IOAPIC directly. - * These additional routes can be discovered through ACPI. - */ - pci_bus_irqs(pcms->pcibus, xen_intx_set_irq, pci_dev, - XEN_IOAPIC_NUM_PIRQS); - } - - isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(pci_dev), "isa.0")); - x86ms->rtc = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev), - "rtc")); - piix4_pm = object_resolve_path_component(OBJECT(pci_dev), "pm"); - dev = DEVICE(object_resolve_path_component(OBJECT(pci_dev), "ide")); - pci_ide_create_devs(PCI_DEVICE(dev)); - pcms->idebus[0] = qdev_get_child_bus(dev, "ide.0"); - pcms->idebus[1] = qdev_get_child_bus(dev, "ide.1"); - } else { - uint32_t irq; - - isa_bus = isa_bus_new(NULL, system_memory, system_io, - &error_abort); - isa_bus_register_input_irqs(isa_bus, x86ms->gsi); - - x86ms->rtc = isa_new(TYPE_MC146818_RTC); - qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000); - isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal); - irq = object_property_get_uint(OBJECT(x86ms->rtc), "irq", - &error_fatal); - isa_connect_gpio_out(ISA_DEVICE(x86ms->rtc), 0, irq); - - i8257_dma_init(OBJECT(machine), isa_bus, 0); - pcms->hpet_enabled = false; + pci_dev = pci_new_multifunction(-1, pcms->south_bridge); + object_property_set_bool(OBJECT(pci_dev), "has-usb", + machine_usb(machine), &error_abort); + object_property_set_bool(OBJECT(pci_dev), "has-acpi", + x86_machine_is_acpi_enabled(x86ms), + &error_abort); + object_property_set_bool(OBJECT(pci_dev), "has-pic", false, + &error_abort); + object_property_set_bool(OBJECT(pci_dev), "has-pit", false, + &error_abort); + qdev_prop_set_uint32(DEVICE(pci_dev), "smb_io_base", 0xb100); + object_property_set_bool(OBJECT(pci_dev), "smm-enabled", + x86_machine_is_smm_enabled(x86ms), + &error_abort); + dev = DEVICE(pci_dev); + for (i = 0; i < ISA_NUM_IRQS; i++) { + qdev_connect_gpio_out_named(dev, "isa-irqs", i, x86ms->gsi[i]); } + pci_realize_and_unref(pci_dev, pcms->pcibus, &error_fatal); + + if (xen_enabled()) { + pci_device_set_intx_routing_notifier( + pci_dev, piix_intx_routing_notifier_xen); + + /* + * Xen supports additional interrupt routes from the PCI devices to + * the IOAPIC: the four pins of each PCI device on the bus are also + * connected to the IOAPIC directly. + * These additional routes can be discovered through ACPI. + */ + pci_bus_irqs(pcms->pcibus, xen_intx_set_irq, pci_dev, + XEN_IOAPIC_NUM_PIRQS); + } + + isa_bus = ISA_BUS(qdev_get_child_bus(DEVICE(pci_dev), "isa.0")); + x86ms->rtc = ISA_DEVICE(object_resolve_path_component(OBJECT(pci_dev), + "rtc")); + piix4_pm = object_resolve_path_component(OBJECT(pci_dev), "pm"); + dev = DEVICE(object_resolve_path_component(OBJECT(pci_dev), "ide")); + pci_ide_create_devs(PCI_DEVICE(dev)); + pcms->idebus[0] = qdev_get_child_bus(dev, "ide.0"); + pcms->idebus[1] = qdev_get_child_bus(dev, "ide.1"); + if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) { pc_i8259_create(isa_bus, gsi_state->i8259_irq); @@ -321,7 +303,7 @@ static void pc_init1(MachineState *machine, const char *pci_type) x86_register_ferr_irq(x86ms->gsi[13]); } - pc_vga_init(isa_bus, pcmc->pci_enabled ? pcms->pcibus : NULL); + pc_vga_init(isa_bus, pcms->pcibus); /* init basic PC hardware */ pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, @@ -329,28 +311,6 @@ static void pc_init1(MachineState *machine, const char *pci_type) pc_nic_init(pcmc, isa_bus, pcms->pcibus); -#ifdef CONFIG_IDE_ISA - if (!pcmc->pci_enabled) { - DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; - int i; - - ide_drive_get(hd, ARRAY_SIZE(hd)); - for (i = 0; i < MAX_IDE_BUS; i++) { - ISADevice *dev; - char busname[] = "ide.0"; - dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], - ide_irq[i], - hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]); - /* - * The ide bus name is ide.0 for the first bus and ide.1 for the - * second one. - */ - busname[4] = '0' + i; - pcms->idebus[i] = qdev_get_child_bus(DEVICE(dev), busname); - } - } -#endif - if (piix4_pm) { smi_irq = qemu_allocate_irq(pc_acpi_smi_interrupt, first_cpu, 0); From 3113e7db1d5d683e91cdbb4796e1b154cdda73bf Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:09:59 +0100 Subject: [PATCH 16/28] hw/i386: move isapc machine to separate isapc.c file Now that pc_init_isa() is independent of any PCI initialisation, move it into a separate isapc.c file including the ISA IDE variables which are now no longer needed for the pc-i440fx machine. This enables us to finally fix the dependency of ISAPC on I440FX in hw/i386/Kconfig. Note that as part of the move to a separate file we can see that the licence text is a verbatim copy of the MIT licence. The text originates from commit 1df912cf9e ("VL license of the day is MIT/BSD") so we can be sure that this was the original intent. As a consequence we can update the file header to use a SPDX tag as per the current project contribution guidelines. Signed-off-by: Mark Cave-Ayland Reviewed-by: Bernhard Beschow Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-17-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/Kconfig | 3 - hw/i386/isapc.c | 190 ++++++++++++++++++++++++++++++++++++++++++++ hw/i386/meson.build | 1 + hw/i386/pc_piix.c | 172 --------------------------------------- 4 files changed, 191 insertions(+), 175 deletions(-) create mode 100644 hw/i386/isapc.c diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig index 3a0e2b8ebb..6a0ab54bea 100644 --- a/hw/i386/Kconfig +++ b/hw/i386/Kconfig @@ -96,9 +96,6 @@ config ISAPC select ISA_BUS select PC select IDE_ISA - # FIXME: it is in the same file as i440fx, and does not compile - # if separated - depends on I440FX config Q35 bool diff --git a/hw/i386/isapc.c b/hw/i386/isapc.c new file mode 100644 index 0000000000..300d64b7ad --- /dev/null +++ b/hw/i386/isapc.c @@ -0,0 +1,190 @@ +/* + * QEMU PC System Emulator + * + * Copyright (c) 2003-2004 Fabrice Bellard + * + * SPDX-License-Identifier: MIT + */ + +#include "qemu/osdep.h" + +#include "qemu/units.h" +#include "qemu/error-report.h" +#include "hw/char/parallel-isa.h" +#include "hw/dma/i8257.h" +#include "hw/i386/pc.h" +#include "hw/ide/isa.h" +#include "hw/ide/ide-bus.h" +#include "system/kvm.h" +#include "hw/i386/kvm/clock.h" +#include "hw/xen/xen-x86.h" +#include "system/xen.h" +#include "hw/rtc/mc146818rtc.h" +#include "target/i386/cpu.h" + +static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 }; +static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 }; +static const int ide_irq[MAX_IDE_BUS] = { 14, 15 }; + + +static void pc_init_isa(MachineState *machine) +{ + PCMachineState *pcms = PC_MACHINE(machine); + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + X86MachineState *x86ms = X86_MACHINE(machine); + MemoryRegion *system_memory = get_system_memory(); + MemoryRegion *system_io = get_system_io(); + ISABus *isa_bus; + uint32_t irq; + GSIState *gsi_state; + MemoryRegion *ram_memory; + MemoryRegion *rom_memory = system_memory; + DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; + int i; + + /* + * There is a small chance that someone unintentionally passes "-cpu max" + * for the isapc machine, which will provide a much more modern 32-bit + * CPU than would be expected for an ISA-era PC. If the "max" cpu type has + * been specified, choose the "best" 32-bit cpu possible which we consider + * be the pentium3 (deliberately choosing an Intel CPU given that the + * default 486 CPU for the isapc machine is also an Intel CPU). + */ + if (!strcmp(machine->cpu_type, X86_CPU_TYPE_NAME("max"))) { + machine->cpu_type = X86_CPU_TYPE_NAME("pentium3"); + warn_report("-cpu max is invalid for isapc machine, using pentium3"); + } + + /* + * Similarly if someone unintentionally passes "-cpu host" for the isapc + * machine then display a warning and also switch to the "best" 32-bit + * cpu possible which we consider to be the pentium3. This is because any + * host CPU will already be modern than this, but it also ensures any + * newer CPU flags/features are filtered out for older guests. + */ + if (!strcmp(machine->cpu_type, X86_CPU_TYPE_NAME("host"))) { + machine->cpu_type = X86_CPU_TYPE_NAME("pentium3"); + warn_report("-cpu host is invalid for isapc machine, using pentium3"); + } + + if (machine->ram_size > 3.5 * GiB) { + error_report("Too much memory for this machine: %" PRId64 " MiB, " + "maximum 3584 MiB", machine->ram_size / MiB); + exit(1); + } + + /* + * There is no RAM split for the isapc machine + */ + if (xen_enabled()) { + xen_hvm_init_pc(pcms, &ram_memory); + } else { + ram_memory = machine->ram; + + pcms->max_ram_below_4g = 3.5 * GiB; + x86ms->above_4g_mem_size = 0; + x86ms->below_4g_mem_size = machine->ram_size; + } + + x86_cpus_init(x86ms, pcmc->default_cpu_version); + + if (kvm_enabled()) { + kvmclock_create(pcmc->kvmclock_create_always); + } + + /* allocate ram and load rom/bios */ + if (!xen_enabled()) { + pc_memory_init(pcms, system_memory, rom_memory, 0); + } else { + assert(machine->ram_size == x86ms->below_4g_mem_size + + x86ms->above_4g_mem_size); + + if (machine->kernel_filename != NULL) { + /* For xen HVM direct kernel boot, load linux here */ + xen_load_linux(pcms); + } + } + + gsi_state = pc_gsi_create(&x86ms->gsi, false); + + isa_bus = isa_bus_new(NULL, system_memory, system_io, + &error_abort); + isa_bus_register_input_irqs(isa_bus, x86ms->gsi); + + x86ms->rtc = isa_new(TYPE_MC146818_RTC); + qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000); + isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal); + irq = object_property_get_uint(OBJECT(x86ms->rtc), "irq", + &error_fatal); + isa_connect_gpio_out(ISA_DEVICE(x86ms->rtc), 0, irq); + + i8257_dma_init(OBJECT(machine), isa_bus, 0); + pcms->hpet_enabled = false; + + if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) { + pc_i8259_create(isa_bus, gsi_state->i8259_irq); + } + + if (tcg_enabled()) { + x86_register_ferr_irq(x86ms->gsi[13]); + } + + pc_vga_init(isa_bus, NULL); + + /* init basic PC hardware */ + pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, + !MACHINE_CLASS(pcmc)->no_floppy, 0x4); + + pc_nic_init(pcmc, isa_bus, NULL); + + ide_drive_get(hd, ARRAY_SIZE(hd)); + for (i = 0; i < MAX_IDE_BUS; i++) { + ISADevice *dev; + char busname[] = "ide.0"; + dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], + ide_irq[i], + hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]); + /* + * The ide bus name is ide.0 for the first bus and ide.1 for the + * second one. + */ + busname[4] = '0' + i; + pcms->idebus[i] = qdev_get_child_bus(DEVICE(dev), busname); + } +} + +static void isapc_machine_options(MachineClass *m) +{ + static const char * const valid_cpu_types[] = { + X86_CPU_TYPE_NAME("486"), + X86_CPU_TYPE_NAME("athlon"), + X86_CPU_TYPE_NAME("kvm32"), + X86_CPU_TYPE_NAME("pentium"), + X86_CPU_TYPE_NAME("pentium2"), + X86_CPU_TYPE_NAME("pentium3"), + X86_CPU_TYPE_NAME("qemu32"), + X86_CPU_TYPE_NAME("max"), + X86_CPU_TYPE_NAME("host"), + NULL + }; + PCMachineClass *pcmc = PC_MACHINE_CLASS(m); + + m->desc = "ISA-only PC"; + m->max_cpus = 1; + m->option_rom_has_mr = true; + m->rom_file_has_mr = false; + pcmc->pci_enabled = false; + pcmc->has_acpi_build = false; + pcmc->smbios_defaults = false; + pcmc->gigabyte_align = false; + pcmc->smbios_legacy_mode = true; + pcmc->has_reserved_memory = false; + m->default_nic = "ne2k_isa"; + m->default_cpu_type = X86_CPU_TYPE_NAME("486"); + m->valid_cpu_types = valid_cpu_types; + m->no_floppy = !module_object_class_by_name(TYPE_ISA_FDC); + m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL); +} + +DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa, + isapc_machine_options); diff --git a/hw/i386/meson.build b/hw/i386/meson.build index 7896f348cf..436b3ce52d 100644 --- a/hw/i386/meson.build +++ b/hw/i386/meson.build @@ -14,6 +14,7 @@ i386_ss.add(when: 'CONFIG_X86_IOMMU', if_true: files('x86-iommu.c'), i386_ss.add(when: 'CONFIG_AMD_IOMMU', if_true: files('amd_iommu.c'), if_false: files('amd_iommu-stub.c')) i386_ss.add(when: 'CONFIG_I440FX', if_true: files('pc_piix.c')) +i386_ss.add(when: 'CONFIG_ISAPC', if_true: files('isapc.c')) i386_ss.add(when: 'CONFIG_MICROVM', if_true: files('x86-common.c', 'microvm.c', 'acpi-microvm.c', 'microvm-dt.c')) i386_ss.add(when: 'CONFIG_NITRO_ENCLAVE', if_true: files('nitro_enclave.c')) i386_ss.add(when: 'CONFIG_Q35', if_true: files('pc_q35.c')) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 3ea77b2c44..988c9edc32 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -71,12 +71,6 @@ #define XEN_IOAPIC_NUM_PIRQS 128ULL -#ifdef CONFIG_IDE_ISA -static const int ide_iobase[MAX_IDE_BUS] = { 0x1f0, 0x170 }; -static const int ide_iobase2[MAX_IDE_BUS] = { 0x3f6, 0x376 }; -static const int ide_irq[MAX_IDE_BUS] = { 14, 15 }; -#endif - static GlobalProperty pc_piix_compat_defaults[] = { { TYPE_RAMFB_DEVICE, "use-legacy-x86-rom", "true" }, { TYPE_VFIO_PCI_NOHOTPLUG, "use-legacy-x86-rom", "true" }, @@ -392,134 +386,6 @@ static void pc_set_south_bridge(Object *obj, int value, Error **errp) pcms->south_bridge = PCSouthBridgeOption_lookup.array[value]; } -#ifdef CONFIG_ISAPC -static void pc_init_isa(MachineState *machine) -{ - PCMachineState *pcms = PC_MACHINE(machine); - PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); - X86MachineState *x86ms = X86_MACHINE(machine); - MemoryRegion *system_memory = get_system_memory(); - MemoryRegion *system_io = get_system_io(); - ISABus *isa_bus; - uint32_t irq; - GSIState *gsi_state; - MemoryRegion *ram_memory; - MemoryRegion *rom_memory = system_memory; - DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; - int i; - - /* - * There is a small chance that someone unintentionally passes "-cpu max" - * for the isapc machine, which will provide a much more modern 32-bit - * CPU than would be expected for an ISA-era PC. If the "max" cpu type has - * been specified, choose the "best" 32-bit cpu possible which we consider - * be the pentium3 (deliberately choosing an Intel CPU given that the - * default 486 CPU for the isapc machine is also an Intel CPU). - */ - if (!strcmp(machine->cpu_type, X86_CPU_TYPE_NAME("max"))) { - machine->cpu_type = X86_CPU_TYPE_NAME("pentium3"); - warn_report("-cpu max is invalid for isapc machine, using pentium3"); - } - - /* - * Similarly if someone unintentionally passes "-cpu host" for the isapc - * machine then display a warning and also switch to the "best" 32-bit - * cpu possible which we consider to be the pentium3. This is because any - * host CPU will already be modern than this, but it also ensures any - * newer CPU flags/features are filtered out for older guests. - */ - if (!strcmp(machine->cpu_type, X86_CPU_TYPE_NAME("host"))) { - machine->cpu_type = X86_CPU_TYPE_NAME("pentium3"); - warn_report("-cpu host is invalid for isapc machine, using pentium3"); - } - - if (machine->ram_size > 3.5 * GiB) { - error_report("Too much memory for this machine: %" PRId64 " MiB, " - "maximum 3584 MiB", machine->ram_size / MiB); - exit(1); - } - - /* - * There is no RAM split for the isapc machine - */ - if (xen_enabled()) { - xen_hvm_init_pc(pcms, &ram_memory); - } else { - ram_memory = machine->ram; - - pcms->max_ram_below_4g = 3.5 * GiB; - x86ms->above_4g_mem_size = 0; - x86ms->below_4g_mem_size = machine->ram_size; - } - - x86_cpus_init(x86ms, pcmc->default_cpu_version); - - if (kvm_enabled()) { - kvmclock_create(pcmc->kvmclock_create_always); - } - - /* allocate ram and load rom/bios */ - if (!xen_enabled()) { - pc_memory_init(pcms, system_memory, rom_memory, 0); - } else { - assert(machine->ram_size == x86ms->below_4g_mem_size + - x86ms->above_4g_mem_size); - - if (machine->kernel_filename != NULL) { - /* For xen HVM direct kernel boot, load linux here */ - xen_load_linux(pcms); - } - } - - gsi_state = pc_gsi_create(&x86ms->gsi, false); - - isa_bus = isa_bus_new(NULL, system_memory, system_io, - &error_abort); - isa_bus_register_input_irqs(isa_bus, x86ms->gsi); - - x86ms->rtc = isa_new(TYPE_MC146818_RTC); - qdev_prop_set_int32(DEVICE(x86ms->rtc), "base_year", 2000); - isa_realize_and_unref(x86ms->rtc, isa_bus, &error_fatal); - irq = object_property_get_uint(OBJECT(x86ms->rtc), "irq", - &error_fatal); - isa_connect_gpio_out(ISA_DEVICE(x86ms->rtc), 0, irq); - - i8257_dma_init(OBJECT(machine), isa_bus, 0); - pcms->hpet_enabled = false; - - if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) { - pc_i8259_create(isa_bus, gsi_state->i8259_irq); - } - - if (tcg_enabled()) { - x86_register_ferr_irq(x86ms->gsi[13]); - } - - pc_vga_init(isa_bus, NULL); - - /* init basic PC hardware */ - pc_basic_device_init(pcms, isa_bus, x86ms->gsi, x86ms->rtc, - !MACHINE_CLASS(pcmc)->no_floppy, 0x4); - - pc_nic_init(pcmc, isa_bus, NULL); - - ide_drive_get(hd, ARRAY_SIZE(hd)); - for (i = 0; i < MAX_IDE_BUS; i++) { - ISADevice *dev; - char busname[] = "ide.0"; - dev = isa_ide_init(isa_bus, ide_iobase[i], ide_iobase2[i], - ide_irq[i], - hd[MAX_IDE_DEVS * i], hd[MAX_IDE_DEVS * i + 1]); - /* - * The ide bus name is ide.0 for the first bus and ide.1 for the - * second one. - */ - busname[4] = '0' + i; - pcms->idebus[i] = qdev_get_child_bus(DEVICE(dev), busname); - } -} -#endif - #ifdef CONFIG_XEN static void pc_xen_hvm_init(MachineState *machine) { @@ -887,44 +753,6 @@ static void pc_i440fx_machine_2_6_options(MachineClass *m) DEFINE_I440FX_MACHINE(2, 6); -#ifdef CONFIG_ISAPC -static void isapc_machine_options(MachineClass *m) -{ - static const char * const valid_cpu_types[] = { - X86_CPU_TYPE_NAME("486"), - X86_CPU_TYPE_NAME("athlon"), - X86_CPU_TYPE_NAME("kvm32"), - X86_CPU_TYPE_NAME("pentium"), - X86_CPU_TYPE_NAME("pentium2"), - X86_CPU_TYPE_NAME("pentium3"), - X86_CPU_TYPE_NAME("qemu32"), - X86_CPU_TYPE_NAME("max"), - X86_CPU_TYPE_NAME("host"), - NULL - }; - PCMachineClass *pcmc = PC_MACHINE_CLASS(m); - - m->desc = "ISA-only PC"; - m->max_cpus = 1; - m->option_rom_has_mr = true; - m->rom_file_has_mr = false; - pcmc->pci_enabled = false; - pcmc->has_acpi_build = false; - pcmc->smbios_defaults = false; - pcmc->gigabyte_align = false; - pcmc->smbios_legacy_mode = true; - pcmc->has_reserved_memory = false; - m->default_nic = "ne2k_isa"; - m->default_cpu_type = X86_CPU_TYPE_NAME("486"); - m->valid_cpu_types = valid_cpu_types; - m->no_floppy = !module_object_class_by_name(TYPE_ISA_FDC); - m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL); -} - -DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa, - isapc_machine_options); -#endif - #ifdef CONFIG_XEN static void xenfv_machine_4_2_options(MachineClass *m) { From d773f9689109eb00ba6627c2264af949c66897fb Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:10:00 +0100 Subject: [PATCH 17/28] hw/i386/pc_piix.c: remove unused headers after isapc machine split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The headers for isapc-only devices can be removed from pc_piix.c since they are no longer used by the i440fx-pc machine. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-18-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 988c9edc32..627de09c70 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -27,19 +27,16 @@ #include "qemu/units.h" #include "hw/char/parallel-isa.h" -#include "hw/dma/i8257.h" #include "hw/i386/x86.h" #include "hw/i386/pc.h" #include "hw/i386/apic.h" #include "hw/pci-host/i440fx.h" -#include "hw/rtc/mc146818rtc.h" #include "hw/southbridge/piix.h" #include "hw/display/ramfb.h" #include "hw/pci/pci.h" #include "hw/pci/pci_ids.h" #include "hw/usb.h" #include "net/net.h" -#include "hw/ide/isa.h" #include "hw/ide/pci.h" #include "hw/irq.h" #include "system/kvm.h" From 523a64f388a689a1e9bc593ca8768fe1449613db Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:10:01 +0100 Subject: [PATCH 18/28] hw/i386/pc_piix.c: replace rom_memory with pci_memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we can guarantee the i440fx-pc machine will always have a PCI bus, any instances of rom_memory can be replaced by pci_memory and rom_memory removed completely. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-19-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/pc_piix.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 627de09c70..7e78b6daa6 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -113,7 +113,6 @@ static void pc_init1(MachineState *machine, const char *pci_type) GSIState *gsi_state; MemoryRegion *ram_memory; MemoryRegion *pci_memory = NULL; - MemoryRegion *rom_memory = system_memory; ram_addr_t lowmem; uint64_t hole64_size = 0; PCIDevice *pci_dev; @@ -193,7 +192,6 @@ static void pc_init1(MachineState *machine, const char *pci_type) pci_memory = g_new(MemoryRegion, 1); memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); - rom_memory = pci_memory; phb = OBJECT(qdev_new(TYPE_I440FX_PCI_HOST_BRIDGE)); object_property_add_child(OBJECT(machine), "i440fx", phb); @@ -224,7 +222,7 @@ static void pc_init1(MachineState *machine, const char *pci_type) /* allocate ram and load rom/bios */ if (!xen_enabled()) { - pc_memory_init(pcms, system_memory, rom_memory, hole64_size); + pc_memory_init(pcms, system_memory, pci_memory, hole64_size); } else { assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); From d8701867d12241f53f3b17973e7fd533c764c76a Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Thu, 28 Aug 2025 12:10:02 +0100 Subject: [PATCH 19/28] hw/i386/isapc.c: replace rom_memory with system_memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we can guarantee the isapc machine will never have a PCI bus, any instances of rom_memory can be replaced by system_memory and rom_memory removed completely. Signed-off-by: Mark Cave-Ayland Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250828111057.468712-20-mark.caveayland@nutanix.com Signed-off-by: Paolo Bonzini --- hw/i386/isapc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/i386/isapc.c b/hw/i386/isapc.c index 300d64b7ad..44f4a44672 100644 --- a/hw/i386/isapc.c +++ b/hw/i386/isapc.c @@ -38,7 +38,6 @@ static void pc_init_isa(MachineState *machine) uint32_t irq; GSIState *gsi_state; MemoryRegion *ram_memory; - MemoryRegion *rom_memory = system_memory; DriveInfo *hd[MAX_IDE_BUS * MAX_IDE_DEVS]; int i; @@ -94,7 +93,7 @@ static void pc_init_isa(MachineState *machine) /* allocate ram and load rom/bios */ if (!xen_enabled()) { - pc_memory_init(pcms, system_memory, rom_memory, 0); + pc_memory_init(pcms, system_memory, system_memory, 0); } else { assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); From b8217bbaf2bafef1a4f54082a3548613eeef8f2b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 29 Aug 2025 10:46:48 +0200 Subject: [PATCH 20/28] user-exec: ensure interrupt_request is not used cpu_interrupt() is not called anymore except by ARM but even there it is dead code; disentangling the various cpregs accessors from user-mode emulation is a work in progress. Signed-off-by: Paolo Bonzini --- accel/tcg/cpu-exec.c | 6 ++++-- accel/tcg/user-exec.c | 4 +--- include/hw/core/cpu.h | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 713bdb2056..b44dd1e820 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -778,6 +778,9 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, */ qatomic_set_mb(&cpu->neg.icount_decr.u16.high, 0); +#ifdef CONFIG_USER_ONLY + g_assert(!qatomic_read(&cpu->interrupt_request)); +#else if (unlikely(qatomic_read(&cpu->interrupt_request))) { int interrupt_request; bql_lock(); @@ -792,7 +795,6 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, bql_unlock(); return true; } -#if !defined(CONFIG_USER_ONLY) if (replay_mode == REPLAY_MODE_PLAY && !replay_has_interrupt()) { /* Do nothing */ } else if (interrupt_request & CPU_INTERRUPT_HALT) { @@ -840,7 +842,6 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, * reload the 'interrupt_request' value */ interrupt_request = cpu->interrupt_request; } -#endif /* !CONFIG_USER_ONLY */ if (interrupt_request & CPU_INTERRUPT_EXITTB) { cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB; /* ensure that no TB jump will be modified as @@ -851,6 +852,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, /* If we exit via cpu_loop_exit/longjmp it is reset in cpu_exec */ bql_unlock(); } +#endif /* !CONFIG_USER_ONLY */ /* Finally, check if we need to exit to the main loop. */ if (unlikely(qatomic_read(&cpu->exit_request)) || icount_exit_request(cpu)) { diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c index f25d80e2dc..748bfab04a 100644 --- a/accel/tcg/user-exec.c +++ b/accel/tcg/user-exec.c @@ -48,9 +48,7 @@ __thread uintptr_t helper_retaddr; void cpu_interrupt(CPUState *cpu, int mask) { - g_assert(bql_locked()); - cpu->interrupt_request |= mask; - qatomic_set(&cpu->neg.icount_decr.u16.high, -1); + g_assert_not_reached(); } /* diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 5eaf41a566..f73b4357c7 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -423,6 +423,7 @@ struct qemu_work_item; * @created: Indicates whether the CPU thread has been successfully created. * @halt_cond: condition variable sleeping threads can wait on. * @interrupt_request: Indicates a pending interrupt request. + * Only used by system emulation. * @halted: Nonzero if the CPU is in suspended state. * @stop: Indicates a pending stop request. * @stopped: Indicates the CPU has been artificially stopped. From 87511341c30d8c9c77178db16491a0ccacc5d64b Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 21 Aug 2025 17:56:03 +0200 Subject: [PATCH 21/28] add cpu_test_interrupt()/cpu_set_interrupt() helpers and use them tree wide The helpers form load-acquire/store-release pair and ensure that appropriate barriers are in place in case checks happen outside of BQL. Use them to replace open-coded checkers/setters across the code, to make sure that barriers are not missed. Helpers also make code a bit more readable. Signed-off-by: Igor Mammedov Reviewed-by: Peter Xu Reviewed-by: Jason J. Herne Link: https://lore.kernel.org/r/20250821155603.2422553-1-imammedo@redhat.com Signed-off-by: Paolo Bonzini --- accel/tcg/cpu-exec.c | 12 +++++----- accel/tcg/tcg-accel-ops.c | 2 +- hw/intc/s390_flic.c | 2 +- hw/openrisc/cputimer.c | 2 +- include/hw/core/cpu.h | 22 +++++++++++++++++++ system/cpus.c | 9 +++++++- target/alpha/cpu.c | 8 +++---- target/arm/cpu.c | 20 ++++++++--------- target/arm/helper.c | 18 +++++++-------- target/arm/hvf/hvf.c | 6 ++--- target/avr/cpu.c | 2 +- target/hppa/cpu.c | 2 +- target/i386/hvf/hvf.c | 4 ++-- target/i386/hvf/x86hvf.c | 21 +++++++++--------- target/i386/kvm/kvm.c | 34 ++++++++++++++--------------- target/i386/nvmm/nvmm-all.c | 24 ++++++++++---------- target/i386/tcg/system/seg_helper.c | 2 +- target/i386/tcg/system/svm_helper.c | 2 +- target/i386/whpx/whpx-all.c | 34 ++++++++++++++--------------- target/loongarch/cpu.c | 2 +- target/m68k/cpu.c | 2 +- target/microblaze/cpu.c | 2 +- target/mips/cpu.c | 6 ++--- target/mips/kvm.c | 2 +- target/openrisc/cpu.c | 3 +-- target/ppc/cpu_init.c | 2 +- target/ppc/kvm.c | 2 +- target/rx/cpu.c | 3 +-- target/rx/helper.c | 2 +- target/s390x/cpu-system.c | 2 +- target/sh4/cpu.c | 2 +- target/sh4/helper.c | 2 +- target/sparc/cpu.c | 2 +- target/sparc/int64_helper.c | 4 ++-- 34 files changed, 145 insertions(+), 119 deletions(-) diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index b44dd1e820..96c124aa72 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -779,9 +779,9 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, qatomic_set_mb(&cpu->neg.icount_decr.u16.high, 0); #ifdef CONFIG_USER_ONLY - g_assert(!qatomic_read(&cpu->interrupt_request)); + assert(!cpu_test_interrupt(cpu, ~0)); #else - if (unlikely(qatomic_read(&cpu->interrupt_request))) { + if (unlikely(cpu_test_interrupt(cpu, ~0))) { int interrupt_request; bql_lock(); interrupt_request = cpu->interrupt_request; @@ -789,7 +789,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, /* Mask out external interrupts for this step. */ interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK; } - if (interrupt_request & CPU_INTERRUPT_DEBUG) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_DEBUG)) { cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG; cpu->exception_index = EXCP_DEBUG; bql_unlock(); @@ -797,7 +797,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, } if (replay_mode == REPLAY_MODE_PLAY && !replay_has_interrupt()) { /* Do nothing */ - } else if (interrupt_request & CPU_INTERRUPT_HALT) { + } else if (cpu_test_interrupt(cpu, CPU_INTERRUPT_HALT)) { replay_interrupt(); cpu->interrupt_request &= ~CPU_INTERRUPT_HALT; cpu->halted = 1; @@ -807,7 +807,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, } else { const TCGCPUOps *tcg_ops = cpu->cc->tcg_ops; - if (interrupt_request & CPU_INTERRUPT_RESET) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_RESET)) { replay_interrupt(); tcg_ops->cpu_exec_reset(cpu); bql_unlock(); @@ -842,7 +842,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, * reload the 'interrupt_request' value */ interrupt_request = cpu->interrupt_request; } - if (interrupt_request & CPU_INTERRUPT_EXITTB) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_EXITTB)) { cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB; /* ensure that no TB jump will be modified as the program flow was changed */ diff --git a/accel/tcg/tcg-accel-ops.c b/accel/tcg/tcg-accel-ops.c index 3b0d7d298e..9c37266c1e 100644 --- a/accel/tcg/tcg-accel-ops.c +++ b/accel/tcg/tcg-accel-ops.c @@ -97,7 +97,7 @@ static void tcg_cpu_reset_hold(CPUState *cpu) /* mask must never be zero, except for A20 change call */ void tcg_handle_interrupt(CPUState *cpu, int mask) { - cpu->interrupt_request |= mask; + cpu_set_interrupt(cpu, mask); /* * If called from iothread context, wake the target cpu in diff --git a/hw/intc/s390_flic.c b/hw/intc/s390_flic.c index 8f4c9fd52e..1eed5125d1 100644 --- a/hw/intc/s390_flic.c +++ b/hw/intc/s390_flic.c @@ -190,7 +190,7 @@ static void qemu_s390_flic_notify(uint32_t type) CPU_FOREACH(cs) { S390CPU *cpu = S390_CPU(cs); - cs->interrupt_request |= CPU_INTERRUPT_HARD; + cpu_set_interrupt(cs, CPU_INTERRUPT_HARD); /* ignore CPUs that are not sleeping */ if (s390_cpu_get_state(cpu) != S390_CPU_STATE_OPERATING && diff --git a/hw/openrisc/cputimer.c b/hw/openrisc/cputimer.c index 6331997d56..51da226fcd 100644 --- a/hw/openrisc/cputimer.c +++ b/hw/openrisc/cputimer.c @@ -105,7 +105,7 @@ static void openrisc_timer_cb(void *opaque) CPUState *cs = CPU(cpu); cpu->env.ttmr |= TTMR_IP; - cs->interrupt_request |= CPU_INTERRUPT_TIMER; + cpu_set_interrupt(cs, CPU_INTERRUPT_TIMER); } switch (cpu->env.ttmr & TTMR_M) { diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index f73b4357c7..b01a0cffd6 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -943,6 +943,28 @@ CPUState *cpu_by_arch_id(int64_t id); void cpu_interrupt(CPUState *cpu, int mask); +/** + * cpu_test_interrupt: + * @cpu: The CPU to check interrupt(s) on. + * @mask: The interrupts to check. + * + * Checks if any of interrupts in @mask are pending on @cpu. + */ +static inline bool cpu_test_interrupt(CPUState *cpu, int mask) +{ + return qatomic_load_acquire(&cpu->interrupt_request) & mask; +} + +/** + * cpu_set_interrupt: + * @cpu: The CPU to set pending interrupt(s) on. + * @mask: The interrupts to set. + * + * Sets interrupts in @mask as pending on @cpu. Unlike @cpu_interrupt, + * this does not kick the vCPU. + */ +void cpu_set_interrupt(CPUState *cpu, int mask); + /** * cpu_set_pc: * @cpu: The CPU to set the program counter for. diff --git a/system/cpus.c b/system/cpus.c index 256723558d..437848b5eb 100644 --- a/system/cpus.c +++ b/system/cpus.c @@ -254,9 +254,16 @@ int64_t cpus_get_elapsed_ticks(void) return cpu_get_ticks(); } +void cpu_set_interrupt(CPUState *cpu, int mask) +{ + /* Pairs with cpu_test_interrupt(). */ + qatomic_store_release(&cpu->interrupt_request, + cpu->interrupt_request | mask); +} + void generic_handle_interrupt(CPUState *cpu, int mask) { - cpu->interrupt_request |= mask; + cpu_set_interrupt(cpu, mask); if (!qemu_cpu_is_self(cpu)) { qemu_cpu_kick(cpu); diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c index bf1787a69d..932cddac05 100644 --- a/target/alpha/cpu.c +++ b/target/alpha/cpu.c @@ -86,10 +86,10 @@ static bool alpha_cpu_has_work(CPUState *cs) assume that if a CPU really wants to stay asleep, it will mask interrupts at the chipset level, which will prevent these bits from being set in the first place. */ - return cs->interrupt_request & (CPU_INTERRUPT_HARD - | CPU_INTERRUPT_TIMER - | CPU_INTERRUPT_SMP - | CPU_INTERRUPT_MCHK); + return cpu_test_interrupt(cs, CPU_INTERRUPT_HARD + | CPU_INTERRUPT_TIMER + | CPU_INTERRUPT_SMP + | CPU_INTERRUPT_MCHK); } #endif /* !CONFIG_USER_ONLY */ diff --git a/target/arm/cpu.c b/target/arm/cpu.c index e2b2337399..a29c3facbf 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -142,11 +142,11 @@ static bool arm_cpu_has_work(CPUState *cs) ARMCPU *cpu = ARM_CPU(cs); return (cpu->power_state != PSCI_OFF) - && cs->interrupt_request & - (CPU_INTERRUPT_FIQ | CPU_INTERRUPT_HARD - | CPU_INTERRUPT_NMI | CPU_INTERRUPT_VINMI | CPU_INTERRUPT_VFNMI - | CPU_INTERRUPT_VFIQ | CPU_INTERRUPT_VIRQ | CPU_INTERRUPT_VSERR - | CPU_INTERRUPT_EXITTB); + && cpu_test_interrupt(cs, + CPU_INTERRUPT_FIQ | CPU_INTERRUPT_HARD + | CPU_INTERRUPT_NMI | CPU_INTERRUPT_VINMI | CPU_INTERRUPT_VFNMI + | CPU_INTERRUPT_VFIQ | CPU_INTERRUPT_VIRQ | CPU_INTERRUPT_VSERR + | CPU_INTERRUPT_EXITTB); } #endif /* !CONFIG_USER_ONLY */ @@ -958,7 +958,7 @@ void arm_cpu_update_virq(ARMCPU *cpu) !(arm_hcrx_el2_eff(env) & HCRX_VINMI)) || (env->irq_line_state & CPU_INTERRUPT_VIRQ); - if (new_state != ((cs->interrupt_request & CPU_INTERRUPT_VIRQ) != 0)) { + if (new_state != cpu_test_interrupt(cs, CPU_INTERRUPT_VIRQ)) { if (new_state) { cpu_interrupt(cs, CPU_INTERRUPT_VIRQ); } else { @@ -980,7 +980,7 @@ void arm_cpu_update_vfiq(ARMCPU *cpu) !(arm_hcrx_el2_eff(env) & HCRX_VFNMI)) || (env->irq_line_state & CPU_INTERRUPT_VFIQ); - if (new_state != ((cs->interrupt_request & CPU_INTERRUPT_VFIQ) != 0)) { + if (new_state != cpu_test_interrupt(cs, CPU_INTERRUPT_VFIQ)) { if (new_state) { cpu_interrupt(cs, CPU_INTERRUPT_VFIQ); } else { @@ -1002,7 +1002,7 @@ void arm_cpu_update_vinmi(ARMCPU *cpu) (arm_hcrx_el2_eff(env) & HCRX_VINMI)) || (env->irq_line_state & CPU_INTERRUPT_VINMI); - if (new_state != ((cs->interrupt_request & CPU_INTERRUPT_VINMI) != 0)) { + if (new_state != cpu_test_interrupt(cs, CPU_INTERRUPT_VINMI)) { if (new_state) { cpu_interrupt(cs, CPU_INTERRUPT_VINMI); } else { @@ -1022,7 +1022,7 @@ void arm_cpu_update_vfnmi(ARMCPU *cpu) bool new_state = (arm_hcr_el2_eff(env) & HCR_VF) && (arm_hcrx_el2_eff(env) & HCRX_VFNMI); - if (new_state != ((cs->interrupt_request & CPU_INTERRUPT_VFNMI) != 0)) { + if (new_state != cpu_test_interrupt(cs, CPU_INTERRUPT_VFNMI)) { if (new_state) { cpu_interrupt(cs, CPU_INTERRUPT_VFNMI); } else { @@ -1041,7 +1041,7 @@ void arm_cpu_update_vserr(ARMCPU *cpu) bool new_state = env->cp15.hcr_el2 & HCR_VSE; - if (new_state != ((cs->interrupt_request & CPU_INTERRUPT_VSERR) != 0)) { + if (new_state != cpu_test_interrupt(cs, CPU_INTERRUPT_VSERR)) { if (new_state) { cpu_interrupt(cs, CPU_INTERRUPT_VSERR); } else { diff --git a/target/arm/helper.c b/target/arm/helper.c index 0c1299ff84..4cd36e950a 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -833,40 +833,40 @@ static uint64_t isr_read(CPUARMState *env, const ARMCPRegInfo *ri) uint64_t ret = 0; if (hcr_el2 & HCR_IMO) { - if (cs->interrupt_request & CPU_INTERRUPT_VIRQ) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_VIRQ)) { ret |= CPSR_I; } - if (cs->interrupt_request & CPU_INTERRUPT_VINMI) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_VINMI)) { ret |= ISR_IS; ret |= CPSR_I; } } else { - if (cs->interrupt_request & CPU_INTERRUPT_HARD) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_HARD)) { ret |= CPSR_I; } - if (cs->interrupt_request & CPU_INTERRUPT_NMI) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_NMI)) { ret |= ISR_IS; ret |= CPSR_I; } } if (hcr_el2 & HCR_FMO) { - if (cs->interrupt_request & CPU_INTERRUPT_VFIQ) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_VFIQ)) { ret |= CPSR_F; } - if (cs->interrupt_request & CPU_INTERRUPT_VFNMI) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_VFNMI)) { ret |= ISR_FS; ret |= CPSR_F; } } else { - if (cs->interrupt_request & CPU_INTERRUPT_FIQ) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_FIQ)) { ret |= CPSR_F; } } if (hcr_el2 & HCR_AMO) { - if (cs->interrupt_request & CPU_INTERRUPT_VSERR) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_VSERR)) { ret |= CPSR_A; } } @@ -9147,7 +9147,7 @@ void arm_cpu_do_interrupt(CPUState *cs) arm_call_el_change_hook(cpu); if (!kvm_enabled()) { - cs->interrupt_request |= CPU_INTERRUPT_EXITTB; + cpu_set_interrupt(cs, CPU_INTERRUPT_EXITTB); } } #endif /* !CONFIG_USER_ONLY */ diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c index 47b0cd3a35..b77db99079 100644 --- a/target/arm/hvf/hvf.c +++ b/target/arm/hvf/hvf.c @@ -1782,13 +1782,13 @@ static int hvf_sysreg_write(CPUState *cpu, uint32_t reg, uint64_t val) static int hvf_inject_interrupts(CPUState *cpu) { - if (cpu->interrupt_request & CPU_INTERRUPT_FIQ) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_FIQ)) { trace_hvf_inject_fiq(); hv_vcpu_set_pending_interrupt(cpu->accel->fd, HV_INTERRUPT_TYPE_FIQ, true); } - if (cpu->interrupt_request & CPU_INTERRUPT_HARD) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD)) { trace_hvf_inject_irq(); hv_vcpu_set_pending_interrupt(cpu->accel->fd, HV_INTERRUPT_TYPE_IRQ, true); @@ -1840,7 +1840,7 @@ static void hvf_wfi(CPUState *cpu) uint64_t nanos; uint32_t cntfrq; - if (cpu->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ)) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIQ)) { /* Interrupt pending, no need to wait */ return; } diff --git a/target/avr/cpu.c b/target/avr/cpu.c index 6995de6a12..a6df71d020 100644 --- a/target/avr/cpu.c +++ b/target/avr/cpu.c @@ -45,7 +45,7 @@ static vaddr avr_cpu_get_pc(CPUState *cs) static bool avr_cpu_has_work(CPUState *cs) { - return (cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_RESET)) + return cpu_test_interrupt(cs, CPU_INTERRUPT_HARD | CPU_INTERRUPT_RESET) && cpu_interrupts_enabled(cpu_env(cs)); } diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c index 24777727e6..0ca79ee5e2 100644 --- a/target/hppa/cpu.c +++ b/target/hppa/cpu.c @@ -135,7 +135,7 @@ static void hppa_restore_state_to_opc(CPUState *cs, #ifndef CONFIG_USER_ONLY static bool hppa_cpu_has_work(CPUState *cs) { - return cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI); + return cpu_test_interrupt(cs, CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI); } #endif /* !CONFIG_USER_ONLY */ diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c index 818b50419f..8445cadece 100644 --- a/target/i386/hvf/hvf.c +++ b/target/i386/hvf/hvf.c @@ -773,9 +773,9 @@ int hvf_vcpu_exec(CPUState *cpu) switch (exit_reason) { case EXIT_REASON_HLT: { macvm_set_rip(cpu, rip + ins_len); - if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && + if (!(cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD) && (env->eflags & IF_MASK)) - && !(cpu->interrupt_request & CPU_INTERRUPT_NMI) && + && !cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI) && !(idtvec_info & VMCS_IDT_VEC_VALID)) { cpu->halted = 1; ret = EXCP_HLT; diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c index 17fce1d3cd..9e05e0e576 100644 --- a/target/i386/hvf/x86hvf.c +++ b/target/i386/hvf/x86hvf.c @@ -395,7 +395,7 @@ bool hvf_inject_interrupts(CPUState *cs) }; } - if (cs->interrupt_request & CPU_INTERRUPT_NMI) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_NMI)) { if (!(env->hflags2 & HF2_NMI_MASK) && !(info & VMCS_INTR_VALID)) { cs->interrupt_request &= ~CPU_INTERRUPT_NMI; info = VMCS_INTR_VALID | VMCS_INTR_T_NMI | EXCP02_NMI; @@ -406,7 +406,7 @@ bool hvf_inject_interrupts(CPUState *cs) } if (!(env->hflags & HF_INHIBIT_IRQ_MASK) && - (cs->interrupt_request & CPU_INTERRUPT_HARD) && + cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) && (env->eflags & IF_MASK) && !(info & VMCS_INTR_VALID)) { int line = cpu_get_pic_interrupt(env); cs->interrupt_request &= ~CPU_INTERRUPT_HARD; @@ -415,11 +415,10 @@ bool hvf_inject_interrupts(CPUState *cs) VMCS_INTR_VALID | VMCS_INTR_T_HWINTR); } } - if (cs->interrupt_request & CPU_INTERRUPT_HARD) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_HARD)) { vmx_set_int_window_exiting(cs); } - return (cs->interrupt_request - & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)); + return cpu_test_interrupt(cs, CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR); } int hvf_process_events(CPUState *cs) @@ -432,25 +431,25 @@ int hvf_process_events(CPUState *cs) env->eflags = rreg(cs->accel->fd, HV_X86_RFLAGS); } - if (cs->interrupt_request & CPU_INTERRUPT_INIT) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_INIT)) { cpu_synchronize_state(cs); do_cpu_init(cpu); } - if (cs->interrupt_request & CPU_INTERRUPT_POLL) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_POLL)) { cs->interrupt_request &= ~CPU_INTERRUPT_POLL; apic_poll_irq(cpu->apic_state); } - if (((cs->interrupt_request & CPU_INTERRUPT_HARD) && + if ((cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) && (env->eflags & IF_MASK)) || - (cs->interrupt_request & CPU_INTERRUPT_NMI)) { + cpu_test_interrupt(cs, CPU_INTERRUPT_NMI)) { cs->halted = 0; } - if (cs->interrupt_request & CPU_INTERRUPT_SIPI) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_SIPI)) { cpu_synchronize_state(cs); do_cpu_sipi(cpu); } - if (cs->interrupt_request & CPU_INTERRUPT_TPR) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_TPR)) { cs->interrupt_request &= ~CPU_INTERRUPT_TPR; cpu_synchronize_state(cs); apic_handle_tpr_access_report(cpu->apic_state, env->eip, diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 369626f8c8..a7b5c8f81b 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -5453,8 +5453,8 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) int ret; /* Inject NMI */ - if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { - if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI)) { bql_lock(); cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; bql_unlock(); @@ -5465,7 +5465,7 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) strerror(-ret)); } } - if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_SMI)) { bql_lock(); cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; bql_unlock(); @@ -5486,12 +5486,12 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) * or (for userspace APIC, but it is cheap to combine the checks here) * pending TPR access reports. */ - if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { - if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT) && !(env->hflags & HF_SMM_MASK)) { cpu->exit_request = 1; } - if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_TPR)) { cpu->exit_request = 1; } } @@ -5499,7 +5499,7 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) if (!kvm_pic_in_kernel()) { /* Try to inject an interrupt if the guest can accept it */ if (run->ready_for_interrupt_injection && - (cpu->interrupt_request & CPU_INTERRUPT_HARD) && + cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD) && (env->eflags & IF_MASK)) { int irq; @@ -5523,7 +5523,7 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) * interrupt, request an interrupt window exit. This will * cause a return to userspace as soon as the guest is ready to * receive interrupts. */ - if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD)) { run->request_interrupt_window = 1; } else { run->request_interrupt_window = 0; @@ -5595,7 +5595,7 @@ int kvm_arch_process_async_events(CPUState *cs) X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; - if (cs->interrupt_request & CPU_INTERRUPT_MCE) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_MCE)) { /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */ assert(env->mcg_cap); @@ -5618,7 +5618,7 @@ int kvm_arch_process_async_events(CPUState *cs) } } - if ((cs->interrupt_request & CPU_INTERRUPT_INIT) && + if (cpu_test_interrupt(cs, CPU_INTERRUPT_INIT) && !(env->hflags & HF_SMM_MASK)) { kvm_cpu_synchronize_state(cs); do_cpu_init(cpu); @@ -5628,20 +5628,20 @@ int kvm_arch_process_async_events(CPUState *cs) return 0; } - if (cs->interrupt_request & CPU_INTERRUPT_POLL) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_POLL)) { cs->interrupt_request &= ~CPU_INTERRUPT_POLL; apic_poll_irq(cpu->apic_state); } - if (((cs->interrupt_request & CPU_INTERRUPT_HARD) && + if ((cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) && (env->eflags & IF_MASK)) || - (cs->interrupt_request & CPU_INTERRUPT_NMI)) { + cpu_test_interrupt(cs, CPU_INTERRUPT_NMI)) { cs->halted = 0; } - if (cs->interrupt_request & CPU_INTERRUPT_SIPI) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_SIPI)) { kvm_cpu_synchronize_state(cs); do_cpu_sipi(cpu); } - if (cs->interrupt_request & CPU_INTERRUPT_TPR) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_TPR)) { cs->interrupt_request &= ~CPU_INTERRUPT_TPR; kvm_cpu_synchronize_state(cs); apic_handle_tpr_access_report(cpu->apic_state, env->eip, @@ -5656,9 +5656,9 @@ static int kvm_handle_halt(X86CPU *cpu) CPUState *cs = CPU(cpu); CPUX86State *env = &cpu->env; - if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) && + if (!(cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) && (env->eflags & IF_MASK)) && - !(cs->interrupt_request & CPU_INTERRUPT_NMI)) { + !cpu_test_interrupt(cs, CPU_INTERRUPT_NMI)) { cs->halted = 1; return EXCP_HLT; } diff --git a/target/i386/nvmm/nvmm-all.c b/target/i386/nvmm/nvmm-all.c index 92e3b8b2f4..c1ac74c4f0 100644 --- a/target/i386/nvmm/nvmm-all.c +++ b/target/i386/nvmm/nvmm-all.c @@ -413,11 +413,11 @@ nvmm_vcpu_pre_run(CPUState *cpu) * Force the VCPU out of its inner loop to process any INIT requests * or commit pending TPR access. */ - if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { cpu->exit_request = 1; } - if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { + if (!has_event && cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI)) { if (nvmm_can_take_nmi(cpu)) { cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; event->type = NVMM_VCPU_EVENT_INTR; @@ -426,7 +426,7 @@ nvmm_vcpu_pre_run(CPUState *cpu) } } - if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { + if (!has_event && cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD)) { if (nvmm_can_take_int(cpu)) { cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; event->type = NVMM_VCPU_EVENT_INTR; @@ -436,7 +436,7 @@ nvmm_vcpu_pre_run(CPUState *cpu) } /* Don't want SMIs. */ - if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_SMI)) { cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; } @@ -651,9 +651,9 @@ nvmm_handle_halted(struct nvmm_machine *mach, CPUState *cpu, bql_lock(); - if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && + if (!(cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD) && (cpu_env(cpu)->eflags & IF_MASK)) && - !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { + !cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI)) { cpu->exception_index = EXCP_HLT; cpu->halted = true; ret = 1; @@ -691,25 +691,25 @@ nvmm_vcpu_loop(CPUState *cpu) * Some asynchronous events must be handled outside of the inner * VCPU loop. They are handled here. */ - if (cpu->interrupt_request & CPU_INTERRUPT_INIT) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT)) { nvmm_cpu_synchronize_state(cpu); do_cpu_init(x86_cpu); /* set int/nmi windows back to the reset state */ } - if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_POLL)) { cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; apic_poll_irq(x86_cpu->apic_state); } - if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) && + if ((cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD) && (env->eflags & IF_MASK)) || - (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { + cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI)) { cpu->halted = false; } - if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_SIPI)) { nvmm_cpu_synchronize_state(cpu); do_cpu_sipi(x86_cpu); } - if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_TPR)) { cpu->interrupt_request &= ~CPU_INTERRUPT_TPR; nvmm_cpu_synchronize_state(cpu); apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip, diff --git a/target/i386/tcg/system/seg_helper.c b/target/i386/tcg/system/seg_helper.c index d4ea890c12..794a23ddfc 100644 --- a/target/i386/tcg/system/seg_helper.c +++ b/target/i386/tcg/system/seg_helper.c @@ -133,7 +133,7 @@ bool x86_cpu_exec_halt(CPUState *cpu) X86CPU *x86_cpu = X86_CPU(cpu); CPUX86State *env = &x86_cpu->env; - if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_POLL)) { bql_lock(); apic_poll_irq(x86_cpu->apic_state); cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL); diff --git a/target/i386/tcg/system/svm_helper.c b/target/i386/tcg/system/svm_helper.c index dea039b87a..3569196bdd 100644 --- a/target/i386/tcg/system/svm_helper.c +++ b/target/i386/tcg/system/svm_helper.c @@ -403,7 +403,7 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend) env->hflags2 |= HF2_GIF_MASK; if (ctl_has_irq(env)) { - cs->interrupt_request |= CPU_INTERRUPT_VIRQ; + cpu_set_interrupt(cs, CPU_INTERRUPT_VIRQ); } if (virtual_gif_set(env)) { diff --git a/target/i386/whpx/whpx-all.c b/target/i386/whpx/whpx-all.c index b72dcff3c8..878cdd1668 100644 --- a/target/i386/whpx/whpx-all.c +++ b/target/i386/whpx/whpx-all.c @@ -1436,9 +1436,9 @@ static int whpx_handle_halt(CPUState *cpu) int ret = 0; bql_lock(); - if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && + if (!(cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD) && (cpu_env(cpu)->eflags & IF_MASK)) && - !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { + !cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI)) { cpu->exception_index = EXCP_HLT; cpu->halted = true; ret = 1; @@ -1469,15 +1469,15 @@ static void whpx_vcpu_pre_run(CPUState *cpu) /* Inject NMI */ if (!vcpu->interruption_pending && - cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { - if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { + cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI)) { cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; vcpu->interruptable = false; new_int.InterruptionType = WHvX64PendingNmi; new_int.InterruptionPending = 1; new_int.InterruptionVector = 2; } - if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_SMI)) { cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; } } @@ -1486,12 +1486,12 @@ static void whpx_vcpu_pre_run(CPUState *cpu) * Force the VCPU out of its inner loop to process any INIT requests or * commit pending TPR access. */ - if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { - if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT) && !(env->hflags & HF_SMM_MASK)) { cpu->exit_request = 1; } - if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_TPR)) { cpu->exit_request = 1; } } @@ -1501,7 +1501,7 @@ static void whpx_vcpu_pre_run(CPUState *cpu) if (!vcpu->interruption_pending && vcpu->interruptable && (env->eflags & IF_MASK)) { assert(!new_int.InterruptionPending); - if (cpu->interrupt_request & CPU_INTERRUPT_HARD) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD)) { cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; irq = cpu_get_pic_interrupt(env); if (irq >= 0) { @@ -1519,7 +1519,7 @@ static void whpx_vcpu_pre_run(CPUState *cpu) reg_count += 1; } } else if (vcpu->ready_for_pic_interrupt && - (cpu->interrupt_request & CPU_INTERRUPT_HARD)) { + cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD)) { cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; irq = cpu_get_pic_interrupt(env); if (irq >= 0) { @@ -1546,7 +1546,7 @@ static void whpx_vcpu_pre_run(CPUState *cpu) /* Update the state of the interrupt delivery notification */ if (!vcpu->window_registered && - cpu->interrupt_request & CPU_INTERRUPT_HARD) { + cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD)) { reg_values[reg_count].DeliverabilityNotifications = (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) { .InterruptNotification = 1 @@ -1599,30 +1599,30 @@ static void whpx_vcpu_process_async_events(CPUState *cpu) CPUX86State *env = &x86_cpu->env; AccelCPUState *vcpu = cpu->accel; - if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT) && !(env->hflags & HF_SMM_MASK)) { whpx_cpu_synchronize_state(cpu); do_cpu_init(x86_cpu); vcpu->interruptable = true; } - if (cpu->interrupt_request & CPU_INTERRUPT_POLL) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_POLL)) { cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; apic_poll_irq(x86_cpu->apic_state); } - if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) && + if ((cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD) && (env->eflags & IF_MASK)) || - (cpu->interrupt_request & CPU_INTERRUPT_NMI)) { + cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI)) { cpu->halted = false; } - if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_SIPI)) { whpx_cpu_synchronize_state(cpu); do_cpu_sipi(x86_cpu); } - if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { + if (cpu_test_interrupt(cpu, CPU_INTERRUPT_TPR)) { cpu->interrupt_request &= ~CPU_INTERRUPT_TPR; whpx_cpu_synchronize_state(cpu); apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip, diff --git a/target/loongarch/cpu.c b/target/loongarch/cpu.c index abad84c054..3a7621c0ea 100644 --- a/target/loongarch/cpu.c +++ b/target/loongarch/cpu.c @@ -376,7 +376,7 @@ static bool loongarch_cpu_has_work(CPUState *cs) { bool has_work = false; - if ((cs->interrupt_request & CPU_INTERRUPT_HARD) && + if (cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) && cpu_loongarch_hw_interrupts_pending(cpu_env(cs))) { has_work = true; } diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c index 6a09db3a6f..f1b673119d 100644 --- a/target/m68k/cpu.c +++ b/target/m68k/cpu.c @@ -74,7 +74,7 @@ static void m68k_restore_state_to_opc(CPUState *cs, #ifndef CONFIG_USER_ONLY static bool m68k_cpu_has_work(CPUState *cs) { - return cs->interrupt_request & CPU_INTERRUPT_HARD; + return cpu_test_interrupt(cs, CPU_INTERRUPT_HARD); } #endif /* !CONFIG_USER_ONLY */ diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c index ee0a869a94..22231f09e6 100644 --- a/target/microblaze/cpu.c +++ b/target/microblaze/cpu.c @@ -129,7 +129,7 @@ static void mb_restore_state_to_opc(CPUState *cs, #ifndef CONFIG_USER_ONLY static bool mb_cpu_has_work(CPUState *cs) { - return cs->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI); + return cpu_test_interrupt(cs, CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI); } #endif /* !CONFIG_USER_ONLY */ diff --git a/target/mips/cpu.c b/target/mips/cpu.c index 1f6c41fd34..5989c3ba17 100644 --- a/target/mips/cpu.c +++ b/target/mips/cpu.c @@ -145,7 +145,7 @@ static bool mips_cpu_has_work(CPUState *cs) * check for interrupts that can be taken. For pre-release 6 CPUs, * check for CP0 Config7 'Wait IE ignore' bit. */ - if ((cs->interrupt_request & CPU_INTERRUPT_HARD) && + if (cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) && cpu_mips_hw_interrupts_pending(env)) { if (cpu_mips_hw_interrupts_enabled(env) || (env->CP0_Config7 & (1 << CP0C7_WII)) || @@ -160,7 +160,7 @@ static bool mips_cpu_has_work(CPUState *cs) * The QEMU model will issue an _WAKE request whenever the CPUs * should be woken up. */ - if (cs->interrupt_request & CPU_INTERRUPT_WAKE) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_WAKE)) { has_work = true; } @@ -170,7 +170,7 @@ static bool mips_cpu_has_work(CPUState *cs) } /* MIPS Release 6 has the ability to halt the CPU. */ if (env->CP0_Config5 & (1 << CP0C5_VP)) { - if (cs->interrupt_request & CPU_INTERRUPT_WAKE) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_WAKE)) { has_work = true; } if (!mips_vp_active(env)) { diff --git a/target/mips/kvm.c b/target/mips/kvm.c index ec53acb51a..450947c3fa 100644 --- a/target/mips/kvm.c +++ b/target/mips/kvm.c @@ -144,7 +144,7 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) bql_lock(); - if ((cs->interrupt_request & CPU_INTERRUPT_HARD) && + if (cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) && cpu_mips_io_interrupts_pending(cpu)) { intr.cpu = -1; intr.irq = 2; diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c index dfbb2df643..9bbfe22ed3 100644 --- a/target/openrisc/cpu.c +++ b/target/openrisc/cpu.c @@ -78,8 +78,7 @@ static void openrisc_restore_state_to_opc(CPUState *cs, #ifndef CONFIG_USER_ONLY static bool openrisc_cpu_has_work(CPUState *cs) { - return cs->interrupt_request & (CPU_INTERRUPT_HARD | - CPU_INTERRUPT_TIMER); + return cpu_test_interrupt(cs, CPU_INTERRUPT_HARD | CPU_INTERRUPT_TIMER); } #endif /* !CONFIG_USER_ONLY */ diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index a0e77f2673..db841f1260 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -7225,7 +7225,7 @@ static int ppc_cpu_mmu_index(CPUState *cs, bool ifetch) #ifndef CONFIG_USER_ONLY static bool ppc_cpu_has_work(CPUState *cs) { - return cs->interrupt_request & CPU_INTERRUPT_HARD; + return cpu_test_interrupt(cs, CPU_INTERRUPT_HARD); } #endif /* !CONFIG_USER_ONLY */ diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index 015658049e..d145774b09 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -1354,7 +1354,7 @@ static int kvmppc_handle_halt(PowerPCCPU *cpu) CPUState *cs = CPU(cpu); CPUPPCState *env = &cpu->env; - if (!(cs->interrupt_request & CPU_INTERRUPT_HARD) && + if (!cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) && FIELD_EX64(env->msr, MSR, EE)) { cs->halted = 1; cs->exception_index = EXCP_HLT; diff --git a/target/rx/cpu.c b/target/rx/cpu.c index c6dd5d6f83..da02ae7bf8 100644 --- a/target/rx/cpu.c +++ b/target/rx/cpu.c @@ -75,8 +75,7 @@ static void rx_restore_state_to_opc(CPUState *cs, static bool rx_cpu_has_work(CPUState *cs) { - return cs->interrupt_request & - (CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIR); + return cpu_test_interrupt(cs, CPU_INTERRUPT_HARD | CPU_INTERRUPT_FIR); } static int rx_cpu_mmu_index(CPUState *cs, bool ifunc) diff --git a/target/rx/helper.c b/target/rx/helper.c index 0640ab322b..ce003af421 100644 --- a/target/rx/helper.c +++ b/target/rx/helper.c @@ -44,7 +44,7 @@ void rx_cpu_unpack_psw(CPURXState *env, uint32_t psw, int rte) void rx_cpu_do_interrupt(CPUState *cs) { CPURXState *env = cpu_env(cs); - int do_irq = cs->interrupt_request & INT_FLAGS; + int do_irq = cpu_test_interrupt(cs, INT_FLAGS); uint32_t save_psw; env->in_sleep = 0; diff --git a/target/s390x/cpu-system.c b/target/s390x/cpu-system.c index 709ccd5299..f3a9ffb2a2 100644 --- a/target/s390x/cpu-system.c +++ b/target/s390x/cpu-system.c @@ -49,7 +49,7 @@ bool s390_cpu_has_work(CPUState *cs) return false; } - if (!(cs->interrupt_request & CPU_INTERRUPT_HARD)) { + if (!cpu_test_interrupt(cs, CPU_INTERRUPT_HARD)) { return false; } diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c index 4f561e8c91..21ccb86df4 100644 --- a/target/sh4/cpu.c +++ b/target/sh4/cpu.c @@ -108,7 +108,7 @@ static bool superh_io_recompile_replay_branch(CPUState *cs, static bool superh_cpu_has_work(CPUState *cs) { - return cs->interrupt_request & CPU_INTERRUPT_HARD; + return cpu_test_interrupt(cs, CPU_INTERRUPT_HARD); } #endif /* !CONFIG_USER_ONLY */ diff --git a/target/sh4/helper.c b/target/sh4/helper.c index fb7642bda1..1744ef0e6d 100644 --- a/target/sh4/helper.c +++ b/target/sh4/helper.c @@ -58,7 +58,7 @@ int cpu_sh4_is_cached(CPUSH4State *env, target_ulong addr) void superh_cpu_do_interrupt(CPUState *cs) { CPUSH4State *env = cpu_env(cs); - int do_irq = cs->interrupt_request & CPU_INTERRUPT_HARD; + int do_irq = cpu_test_interrupt(cs, CPU_INTERRUPT_HARD); int do_exp, irq_vector = cs->exception_index; /* prioritize exceptions over interrupts */ diff --git a/target/sparc/cpu.c b/target/sparc/cpu.c index 245caf2de0..c9773f1540 100644 --- a/target/sparc/cpu.c +++ b/target/sparc/cpu.c @@ -783,7 +783,7 @@ static void sparc_restore_state_to_opc(CPUState *cs, #ifndef CONFIG_USER_ONLY static bool sparc_cpu_has_work(CPUState *cs) { - return (cs->interrupt_request & CPU_INTERRUPT_HARD) && + return cpu_test_interrupt(cs, CPU_INTERRUPT_HARD) && cpu_interrupts_enabled(cpu_env(cs)); } #endif /* !CONFIG_USER_ONLY */ diff --git a/target/sparc/int64_helper.c b/target/sparc/int64_helper.c index bd14c7a0db..49e4e51c6d 100644 --- a/target/sparc/int64_helper.c +++ b/target/sparc/int64_helper.c @@ -89,7 +89,7 @@ void cpu_check_irqs(CPUSPARCState *env) * the next bit is (2 << psrpil). */ if (pil < (2 << env->psrpil)) { - if (cs->interrupt_request & CPU_INTERRUPT_HARD) { + if (cpu_test_interrupt(cs, CPU_INTERRUPT_HARD)) { trace_sparc64_cpu_check_irqs_reset_irq(env->interrupt_index); env->interrupt_index = 0; cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); @@ -120,7 +120,7 @@ void cpu_check_irqs(CPUSPARCState *env) break; } } - } else if (cs->interrupt_request & CPU_INTERRUPT_HARD) { + } else if (cpu_test_interrupt(cs, CPU_INTERRUPT_HARD)) { trace_sparc64_cpu_check_irqs_disabled(pil, env->pil_in, env->softint, env->interrupt_index); env->interrupt_index = 0; From 73c520b088878682e2d3b7fa19a6366ec8d39829 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 14 Aug 2025 18:05:53 +0200 Subject: [PATCH 22/28] memory: reintroduce BQL-free fine-grained PIO/MMIO This patch brings back Jan's idea [1] of BQL-free IO access This will let us make access to ACPI PM/HPET timers cheaper, and prevent BQL contention in case of workload that heavily uses the timers with a lot of vCPUs. 1) 196ea13104f (memory: Add global-locking property to memory regions) ... de7ea885c539 (kvm: Switch to unlocked MMIO) Signed-off-by: Igor Mammedov Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20250814160600.2327672-2-imammedo@redhat.com Signed-off-by: Paolo Bonzini --- include/system/memory.h | 12 ++++++++++++ system/memory.c | 15 +++++++++++++++ system/physmem.c | 2 +- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/include/system/memory.h b/include/system/memory.h index e2cd6ed126..aa85fc27a1 100644 --- a/include/system/memory.h +++ b/include/system/memory.h @@ -833,6 +833,7 @@ struct MemoryRegion { bool nonvolatile; bool rom_device; bool flush_coalesced_mmio; + bool lockless_io; bool unmergeable; uint8_t dirty_log_mask; bool is_iommu; @@ -2341,6 +2342,17 @@ void memory_region_set_flush_coalesced(MemoryRegion *mr); */ void memory_region_clear_flush_coalesced(MemoryRegion *mr); +/** + * memory_region_enable_lockless_io: Enable lockless (BQL free) acceess. + * + * Enable BQL-free access for devices that are well prepared to handle + * locking during I/O themselves: either by doing fine grained locking or + * by providing lock-free I/O schemes. + * + * @mr: the memory region to be updated. + */ +void memory_region_enable_lockless_io(MemoryRegion *mr); + /** * memory_region_add_eventfd: Request an eventfd to be triggered when a word * is written to a location. diff --git a/system/memory.c b/system/memory.c index 5646547940..44701c465c 100644 --- a/system/memory.c +++ b/system/memory.c @@ -2546,6 +2546,21 @@ void memory_region_clear_flush_coalesced(MemoryRegion *mr) } } +void memory_region_enable_lockless_io(MemoryRegion *mr) +{ + mr->lockless_io = true; + /* + * reentrancy_guard has per device scope, that when enabled + * will effectively prevent concurrent access to device's IO + * MemoryRegion(s) by not calling accessor callback. + * + * Turn it off for lock-less IO enabled devices, to allow + * concurrent IO. + * TODO: remove this when reentrancy_guard becomes per transaction. + */ + mr->disable_reentrancy_guard = true; +} + void memory_region_add_eventfd(MemoryRegion *mr, hwaddr addr, unsigned size, diff --git a/system/physmem.c b/system/physmem.c index e5dd760e0b..f498572fc8 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -2900,7 +2900,7 @@ bool prepare_mmio_access(MemoryRegion *mr) { bool release_lock = false; - if (!bql_locked()) { + if (!bql_locked() && !mr->lockless_io) { bql_lock(); release_lock = true; } From 4ae5e2b2bf65452538511a2895d7a4e2115058a5 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 14 Aug 2025 18:05:54 +0200 Subject: [PATCH 23/28] acpi: mark PMTIMER as unlocked Reading QEMU_CLOCK_VIRTUAL is thread-safe, write access is NOP. This makes possible to boot Windows with large vCPUs count when hv-time is not used. Reproducer: -M q35,hpet=off -cpu host -enable-kvm -smp 240,sockets=4 -m 8G WS2025.img fails to boot within 30min. With this fix it boots within 2-1min. Signed-off-by: Igor Mammedov Reviewed-by: Peter Xu Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20250814160600.2327672-3-imammedo@redhat.com Signed-off-by: Paolo Bonzini --- hw/acpi/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/acpi/core.c b/hw/acpi/core.c index 58f8964e13..ff16582803 100644 --- a/hw/acpi/core.c +++ b/hw/acpi/core.c @@ -547,6 +547,7 @@ void acpi_pm_tmr_init(ACPIREGS *ar, acpi_update_sci_fn update_sci, ar->tmr.timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, acpi_pm_tmr_timer, ar); memory_region_init_io(&ar->tmr.io, memory_region_owner(parent), &acpi_pm_tmr_ops, ar, "acpi-tmr", 4); + memory_region_enable_lockless_io(&ar->tmr.io); memory_region_add_subregion(parent, 8, &ar->tmr.io); } From 7defb58bafac8dcb23c06be5e4f2d1a33d8392fd Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 14 Aug 2025 18:05:55 +0200 Subject: [PATCH 24/28] hpet: switch to fine-grained device locking as a step towards lock-less HPET counter read, use per device locking instead of BQL. Signed-off-by: Igor Mammedov Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20250814160600.2327672-4-imammedo@redhat.com Signed-off-by: Paolo Bonzini --- hw/timer/hpet.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c index cb48cc151f..ab5aa59ae4 100644 --- a/hw/timer/hpet.c +++ b/hw/timer/hpet.c @@ -38,6 +38,7 @@ #include "hw/timer/i8254.h" #include "system/address-spaces.h" #include "qom/object.h" +#include "qemu/lockable.h" #include "trace.h" struct hpet_fw_config hpet_fw_cfg = {.count = UINT8_MAX}; @@ -69,6 +70,7 @@ struct HPETState { SysBusDevice parent_obj; /*< public >*/ + QemuMutex lock; MemoryRegion iomem; uint64_t hpet_offset; bool hpet_offset_saved; @@ -428,6 +430,7 @@ static uint64_t hpet_ram_read(void *opaque, hwaddr addr, trace_hpet_ram_read(addr); addr &= ~4; + QEMU_LOCK_GUARD(&s->lock); /*address range of all global regs*/ if (addr <= 0xff) { switch (addr) { @@ -482,6 +485,7 @@ static void hpet_ram_write(void *opaque, hwaddr addr, int len = MIN(size * 8, 64 - shift); uint64_t old_val, new_val, cleared; + QEMU_LOCK_GUARD(&s->lock); trace_hpet_ram_write(addr, value); addr &= ~4; @@ -679,8 +683,10 @@ static void hpet_init(Object *obj) SysBusDevice *sbd = SYS_BUS_DEVICE(obj); HPETState *s = HPET(obj); + qemu_mutex_init(&s->lock); /* HPET Area */ memory_region_init_io(&s->iomem, obj, &hpet_ram_ops, s, "hpet", HPET_LEN); + memory_region_enable_lockless_io(&s->iomem); sysbus_init_mmio(sbd, &s->iomem); } From a453bf0354412592362139bdf4df0d4900ec0686 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 14 Aug 2025 18:05:56 +0200 Subject: [PATCH 25/28] hpet: move out main counter read into a separate block Follow up patche will switch main counter read to lock-less mode. As preparation for that move relevant branch into a separate top level block to make followup patch cleaner/simplier by reducing contextual noise when lock-less read is introduced. no functional changes. Signed-off-by: Igor Mammedov Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20250814160600.2327672-5-imammedo@redhat.com Signed-off-by: Paolo Bonzini --- hw/timer/hpet.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c index ab5aa59ae4..c776afc0f2 100644 --- a/hw/timer/hpet.c +++ b/hw/timer/hpet.c @@ -431,6 +431,16 @@ static uint64_t hpet_ram_read(void *opaque, hwaddr addr, addr &= ~4; QEMU_LOCK_GUARD(&s->lock); + if (addr == HPET_COUNTER) { + if (hpet_enabled(s)) { + cur_tick = hpet_get_ticks(s); + } else { + cur_tick = s->hpet_counter; + } + trace_hpet_ram_read_reading_counter(addr & 4, cur_tick); + return cur_tick >> shift; + } + /*address range of all global regs*/ if (addr <= 0xff) { switch (addr) { @@ -438,14 +448,6 @@ static uint64_t hpet_ram_read(void *opaque, hwaddr addr, return s->capability >> shift; case HPET_CFG: return s->config >> shift; - case HPET_COUNTER: - if (hpet_enabled(s)) { - cur_tick = hpet_get_ticks(s); - } else { - cur_tick = s->hpet_counter; - } - trace_hpet_ram_read_reading_counter(addr & 4, cur_tick); - return cur_tick >> shift; case HPET_STATUS: return s->isr >> shift; default: From 20c2345290f34aac434284cf9a242c7904d39a27 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 14 Aug 2025 18:05:57 +0200 Subject: [PATCH 26/28] hpet: make main counter read lock-less Make access to main HPET counter lock-less. In unlikely event of an update in progress, readers will busy wait untill update is finished. As result micro benchmark of concurrent reading of HPET counter with large number of vCPU shows over 80% better (less) latency. Signed-off-by: Igor Mammedov Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20250814160600.2327672-6-imammedo@redhat.com Signed-off-by: Paolo Bonzini --- hw/timer/hpet.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c index c776afc0f2..789a31d0a0 100644 --- a/hw/timer/hpet.c +++ b/hw/timer/hpet.c @@ -39,6 +39,7 @@ #include "system/address-spaces.h" #include "qom/object.h" #include "qemu/lockable.h" +#include "qemu/seqlock.h" #include "trace.h" struct hpet_fw_config hpet_fw_cfg = {.count = UINT8_MAX}; @@ -74,6 +75,7 @@ struct HPETState { MemoryRegion iomem; uint64_t hpet_offset; bool hpet_offset_saved; + QemuSeqLock state_version; qemu_irq irqs[HPET_NUM_IRQ_ROUTES]; uint32_t flags; uint8_t rtc_irq_level; @@ -430,17 +432,25 @@ static uint64_t hpet_ram_read(void *opaque, hwaddr addr, trace_hpet_ram_read(addr); addr &= ~4; - QEMU_LOCK_GUARD(&s->lock); if (addr == HPET_COUNTER) { - if (hpet_enabled(s)) { - cur_tick = hpet_get_ticks(s); - } else { - cur_tick = s->hpet_counter; - } + unsigned version; + + /* + * Write update is rare, so busywait here is unlikely to happen + */ + do { + version = seqlock_read_begin(&s->state_version); + if (unlikely(!hpet_enabled(s))) { + cur_tick = s->hpet_counter; + } else { + cur_tick = hpet_get_ticks(s); + } + } while (seqlock_read_retry(&s->state_version, version)); trace_hpet_ram_read_reading_counter(addr & 4, cur_tick); return cur_tick >> shift; } + QEMU_LOCK_GUARD(&s->lock); /*address range of all global regs*/ if (addr <= 0xff) { switch (addr) { @@ -500,6 +510,7 @@ static void hpet_ram_write(void *opaque, hwaddr addr, old_val = s->config; new_val = deposit64(old_val, shift, len, value); new_val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK); + seqlock_write_begin(&s->state_version); s->config = new_val; if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) { /* Enable main counter and interrupt generation. */ @@ -518,6 +529,8 @@ static void hpet_ram_write(void *opaque, hwaddr addr, hpet_del_timer(&s->timer[i]); } } + seqlock_write_end(&s->state_version); + /* i8254 and RTC output pins are disabled * when HPET is in legacy mode */ if (activating_bit(old_val, new_val, HPET_CFG_LEGACY)) { @@ -686,6 +699,7 @@ static void hpet_init(Object *obj) HPETState *s = HPET(obj); qemu_mutex_init(&s->lock); + seqlock_init(&s->state_version); /* HPET Area */ memory_region_init_io(&s->iomem, obj, &hpet_ram_ops, s, "hpet", HPET_LEN); memory_region_enable_lockless_io(&s->iomem); From 17e645c6f17999cd0306c4d18d6f6cb3db55756d Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 14 Aug 2025 18:05:59 +0200 Subject: [PATCH 27/28] kvm: i386: irqchip: take BQL only if there is an interrupt when kernel-irqchip=split is used, QEMU still hits BQL contention issue when reading ACPI PM/HPET timers (despite of timer[s] access being lock-less). So Windows with more than 255 cpus is still not able to boot (since it requires iommu -> split irqchip). Problematic path is in kvm_arch_pre_run() where BQL is taken unconditionally when split irqchip is in use. There are a few parts that BQL protects there: 1. interrupt check and injecting however we do not take BQL when checking for pending interrupt (even within the same function), so the patch takes the same approach for cpu->interrupt_request checks and takes BQL only if there is a job to do. 2. request_interrupt_window access CPUState::kvm_run::request_interrupt_window doesn't need BQL as it's accessed by its own vCPU thread. 3. cr8/cpu_get_apic_tpr access the same (as #2) applies to CPUState::kvm_run::cr8, and APIC registers are also cached/synced (get/put) within the vCPU thread it belongs to. Taking BQL only when is necessary, eleminates BQL bottleneck on IO/MMIO only exit path, improoving latency by 80% on HPET micro benchmark. This lets Windows to boot succesfully (in case hv-time isn't used) when more than 255 vCPUs are in use. Signed-off-by: Igor Mammedov Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20250814160600.2327672-8-imammedo@redhat.com Signed-off-by: Paolo Bonzini --- target/i386/kvm/kvm.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index a7b5c8f81b..306430a052 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -5478,9 +5478,6 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) } } - if (!kvm_pic_in_kernel()) { - bql_lock(); - } /* Force the VCPU out of its inner loop to process any INIT requests * or (for userspace APIC, but it is cheap to combine the checks here) @@ -5489,10 +5486,10 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT) && !(env->hflags & HF_SMM_MASK)) { - cpu->exit_request = 1; + qatomic_set(&cpu->exit_request, 1); } if (cpu_test_interrupt(cpu, CPU_INTERRUPT_TPR)) { - cpu->exit_request = 1; + qatomic_set(&cpu->exit_request, 1); } } @@ -5503,6 +5500,8 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) (env->eflags & IF_MASK)) { int irq; + bql_lock(); + cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; irq = cpu_get_pic_interrupt(env); if (irq >= 0) { @@ -5517,6 +5516,7 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) strerror(-ret)); } } + bql_unlock(); } /* If we have an interrupt but the guest is not ready to receive an @@ -5531,8 +5531,6 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) DPRINTF("setting tpr\n"); run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state); - - bql_unlock(); } } From 83bd8e65bc70cef03a207df315004f8b1301dc53 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Thu, 14 Aug 2025 18:06:00 +0200 Subject: [PATCH 28/28] tcg: move interrupt caching and single step masking closer to user in cpu_handle_interrupt() the only place where cached interrupt_request might have effect is when CPU_INTERRUPT_SSTEP_MASK applied and cached interrupt_request handed over to cpu_exec_interrupt() and need_replay_interrupt(). Simplify code by moving interrupt_request caching and CPU_INTERRUPT_SSTEP_MASK masking into the block where it actually matters and drop reloading cached value from CPUState:interrupt_request as the rest of the code directly uses CPUState:interrupt_request. Signed-off-by: Igor Mammedov Link: https://lore.kernel.org/r/20250814160600.2327672-9-imammedo@redhat.com Signed-off-by: Paolo Bonzini --- accel/tcg/cpu-exec.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 96c124aa72..8491e5badd 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -782,13 +782,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, assert(!cpu_test_interrupt(cpu, ~0)); #else if (unlikely(cpu_test_interrupt(cpu, ~0))) { - int interrupt_request; bql_lock(); - interrupt_request = cpu->interrupt_request; - if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) { - /* Mask out external interrupts for this step. */ - interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK; - } if (cpu_test_interrupt(cpu, CPU_INTERRUPT_DEBUG)) { cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG; cpu->exception_index = EXCP_DEBUG; @@ -806,6 +800,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, return true; } else { const TCGCPUOps *tcg_ops = cpu->cc->tcg_ops; + int interrupt_request = cpu->interrupt_request; if (cpu_test_interrupt(cpu, CPU_INTERRUPT_RESET)) { replay_interrupt(); @@ -814,6 +809,11 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, return true; } + if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) { + /* Mask out external interrupts for this step. */ + interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK; + } + /* * The target hook has 3 exit conditions: * False when the interrupt isn't processed, @@ -838,9 +838,6 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, cpu->exception_index = -1; *last_tb = NULL; } - /* The target hook may have updated the 'cpu->interrupt_request'; - * reload the 'interrupt_request' value */ - interrupt_request = cpu->interrupt_request; } if (cpu_test_interrupt(cpu, CPU_INTERRUPT_EXITTB)) { cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;