From faccaa34f0cb75b93ff1f4bcdd2f40f23f6aa10f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 30 Sep 2025 13:59:20 +0200 Subject: [PATCH 01/35] subprojects: Remove version number from .gitignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Get rid of all the version numbers, and use wildcard matches instead, because peopl will repeatedly forgot to change these versions. Suggested-by: Manos Pitsidianakis Suggested-by: Daniel P. Berrangé Suggested-by: Philippe Mathieu-Daudé Signed-off-by: Paolo Bonzini --- subprojects/.gitignore | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/subprojects/.gitignore b/subprojects/.gitignore index 58a29f0120..0b5d963409 100644 --- a/subprojects/.gitignore +++ b/subprojects/.gitignore @@ -6,21 +6,21 @@ /keycodemapdb /libvfio-user /slirp -/anyhow-1.0.98 -/arbitrary-int-1.2.7 -/attrs-0.2.9 -/bilge-0.2.0 -/bilge-impl-0.2.0 -/either-1.12.0 -/foreign-0.3.1 -/itertools-0.11.0 -/libc-0.2.162 -/proc-macro-error-1.0.4 -/proc-macro-error-attr-1.0.4 -/proc-macro2-1.0.95 -/quote-1.0.36 -/syn-2.0.66 -/unicode-ident-1.0.12 +/anyhow-* +/arbitrary-int-* +/attrs-* +/bilge-* +/bilge-impl-* +/either-* +/foreign-* +/itertools-* +/libc-* +/proc-macro-error-* +/proc-macro-error-attr-* +/proc-macro* +/quote-* +/syn-* +/unicode-ident-* # Workaround for Meson v1.9.0 https://github.com/mesonbuild/meson/issues/14948 /.wraplock From 4c18783a88726c3fc39cea9117b720adb82e557f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 1 Oct 2025 08:52:32 +0200 Subject: [PATCH 02/35] subprojects: add glib-sys-rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marc-André Lureau Signed-off-by: Paolo Bonzini --- rust/meson.build | 2 ++ scripts/archive-source.sh | 1 + subprojects/.gitignore | 1 + subprojects/glib-sys-0.21-rs.wrap | 7 ++++ .../packagefiles/glib-sys-0.21-rs/meson.build | 33 +++++++++++++++++++ 5 files changed, 44 insertions(+) create mode 100644 subprojects/glib-sys-0.21-rs.wrap create mode 100644 subprojects/packagefiles/glib-sys-0.21-rs/meson.build diff --git a/rust/meson.build b/rust/meson.build index 695d5a62de..6ba075c8c7 100644 --- a/rust/meson.build +++ b/rust/meson.build @@ -2,12 +2,14 @@ subproject('anyhow-1-rs', required: true) subproject('bilge-0.2-rs', required: true) subproject('bilge-impl-0.2-rs', required: true) subproject('foreign-0.3-rs', required: true) +subproject('glib-sys-0.21-rs', required: true) subproject('libc-0.2-rs', required: true) anyhow_rs = dependency('anyhow-1-rs') bilge_rs = dependency('bilge-0.2-rs') bilge_impl_rs = dependency('bilge-impl-0.2-rs') foreign_rs = dependency('foreign-0.3-rs') +glib_sys_rs = dependency('glib-sys-0.21-rs') libc_rs = dependency('libc-0.2-rs') subproject('proc-macro2-1-rs', required: true) diff --git a/scripts/archive-source.sh b/scripts/archive-source.sh index a725dd923d..8f97b19a08 100755 --- a/scripts/archive-source.sh +++ b/scripts/archive-source.sh @@ -36,6 +36,7 @@ subprojects=( bilge-impl-0.2-rs either-1-rs foreign-0.3-rs + glib-sys-0.21-rs itertools-0.11-rs keycodemapdb libc-0.2-rs diff --git a/subprojects/.gitignore b/subprojects/.gitignore index 0b5d963409..c00c847837 100644 --- a/subprojects/.gitignore +++ b/subprojects/.gitignore @@ -13,6 +13,7 @@ /bilge-impl-* /either-* /foreign-* +/glib-sys-* /itertools-* /libc-* /proc-macro-error-* diff --git a/subprojects/glib-sys-0.21-rs.wrap b/subprojects/glib-sys-0.21-rs.wrap new file mode 100644 index 0000000000..313ced731a --- /dev/null +++ b/subprojects/glib-sys-0.21-rs.wrap @@ -0,0 +1,7 @@ +[wrap-file] +directory = glib-sys-0.21.2 +source_url = https://crates.io/api/v1/crates/glib-sys/0.21.2/download +source_filename = glib-sys-0.21.2.tar.gz +source_hash = d09d3d0fddf7239521674e57b0465dfbd844632fec54f059f7f56112e3f927e1 +#method = cargo +patch_directory = glib-sys-0.21-rs diff --git a/subprojects/packagefiles/glib-sys-0.21-rs/meson.build b/subprojects/packagefiles/glib-sys-0.21-rs/meson.build new file mode 100644 index 0000000000..8c5483311e --- /dev/null +++ b/subprojects/packagefiles/glib-sys-0.21-rs/meson.build @@ -0,0 +1,33 @@ +project('glib-sys-0.21-rs', 'rust', + meson_version: '>=1.5.0', + version: '0.21.2', + license: 'MIT', + default_options: []) + +subproject('libc-0.2-rs', required: true) +libc_rs = dependency('libc-0.2-rs') + +_glib_sys_rs = static_library( + 'glib_sys', + files('src/lib.rs'), + gnu_symbol_visibility: 'hidden', + override_options: ['rust_std=2021', 'build.rust_std=2021'], + rust_abi: 'rust', + rust_args: [ + '--cap-lints', 'allow', + '--cfg', 'feature="v2_66"', + '--cfg', 'feature="v2_64"', + '--cfg', 'feature="v2_62"', + '--cfg', 'feature="v2_60"', + '--cfg', 'feature="v2_58"', + ], + # should also link with glib; don't bother doing it here since all + # QEMU targets have it + dependencies: [libc_rs], +) + +glib_sys_dep = declare_dependency( + link_with: _glib_sys_rs, +) + +meson.override_dependency('glib-sys-0.21-rs', glib_sys_dep) From 9c40c1ff970d7ec3d4456aeb65245d66e1226118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 24 Sep 2025 17:56:38 +0400 Subject: [PATCH 03/35] rust: use glib-sys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't generate FFI for glib, rely on glib-sys crate. Signed-off-by: Marc-André Lureau Signed-off-by: Paolo Bonzini --- include/hw/core/cpu.h | 2 +- meson.build | 1 + rust/Cargo.lock | 180 +++++++++++++++++++++++++++++ rust/Cargo.toml | 1 + rust/bql/Cargo.toml | 1 + rust/bql/meson.build | 1 + rust/bql/src/bindings.rs | 4 + rust/chardev/Cargo.toml | 1 + rust/chardev/meson.build | 2 +- rust/chardev/src/bindings.rs | 4 + rust/hw/char/pl011/Cargo.toml | 1 + rust/hw/char/pl011/meson.build | 1 + rust/hw/char/pl011/src/bindings.rs | 5 + rust/hw/core/Cargo.toml | 1 + rust/hw/core/meson.build | 2 +- rust/hw/core/src/bindings.rs | 3 + rust/migration/Cargo.toml | 1 + rust/migration/meson.build | 2 +- rust/migration/src/bindings.rs | 1 + rust/qom/Cargo.toml | 1 + rust/qom/meson.build | 2 +- rust/qom/src/bindings.rs | 2 + rust/system/Cargo.toml | 1 + rust/system/meson.build | 2 +- rust/system/src/bindings.rs | 4 + rust/util/Cargo.toml | 1 + rust/util/meson.build | 2 +- rust/util/src/bindings.rs | 2 + 28 files changed, 224 insertions(+), 7 deletions(-) diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index c9f40c2539..4196293ba1 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -169,7 +169,7 @@ struct CPUClass { vaddr (*gdb_adjust_breakpoint)(CPUState *cpu, vaddr addr); const char *gdb_core_xml_file; - const gchar * (*gdb_arch_name)(CPUState *cpu); + const char * (*gdb_arch_name)(CPUState *cpu); const char * (*gdb_get_core_xml_file)(CPUState *cpu); void (*disas_set_info)(CPUState *cpu, disassemble_info *info); diff --git a/meson.build b/meson.build index 55c8202a4d..62766c0f19 100644 --- a/meson.build +++ b/meson.build @@ -4239,6 +4239,7 @@ if have_rust '--no-prepend-enum-name', '--allowlist-file', meson.project_source_root() + '/include/.*', '--allowlist-file', meson.project_build_root() + '/.*', + '--blocklist-file', glib_pc.get_variable('includedir') + '/glib-2.0/.*', ] if not rustfmt.found() if bindgen.version().version_compare('<0.65.0') diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 444ef516a7..1108513349 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -58,15 +58,27 @@ dependencies = [ name = "bql" version = "0.1.0" dependencies = [ + "glib-sys", "migration", ] +[[package]] +name = "cfg-expr" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a2c5f3bf25ec225351aa1c8e230d04d880d3bd89dea133537dafad4ae291e5c" +dependencies = [ + "smallvec", + "target-lexicon", +] + [[package]] name = "chardev" version = "0.1.0" dependencies = [ "bql", "common", + "glib-sys", "migration", "qom", "util", @@ -86,6 +98,12 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + [[package]] name = "foreign" version = "0.3.1" @@ -95,6 +113,28 @@ dependencies = [ "libc", ] +[[package]] +name = "glib-sys" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d09d3d0fddf7239521674e57b0465dfbd844632fec54f059f7f56112e3f927e1" +dependencies = [ + "libc", + "system-deps", +] + +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hpet" version = "0.1.0" @@ -115,6 +155,7 @@ dependencies = [ "bql", "chardev", "common", + "glib-sys", "migration", "qemu_macros", "qom", @@ -122,6 +163,16 @@ dependencies = [ "util", ] +[[package]] +name = "indexmap" +version = "2.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "itertools" version = "0.11.0" @@ -137,14 +188,27 @@ version = "0.2.162" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18d287de67fe55fd7e1581fe933d965a5a9477b38e949cfa9f8574ef01506398" +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + [[package]] name = "migration" version = "0.1.0" dependencies = [ "common", + "glib-sys", "util", ] +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + [[package]] name = "pl011" version = "0.1.0" @@ -155,6 +219,7 @@ dependencies = [ "bql", "chardev", "common", + "glib-sys", "hwcore", "migration", "qom", @@ -211,6 +276,7 @@ version = "0.1.0" dependencies = [ "bql", "common", + "glib-sys", "migration", "qemu_macros", "util", @@ -225,6 +291,50 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "serde" +version = "1.0.226" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dca6411025b24b60bfa7ec1fe1f8e710ac09782dca409ee8237ba74b51295fd" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.226" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba2ba63999edb9dac981fb34b3e5c0d111a69b0924e253ed29d83f7c99e966a4" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.226" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8db53ae22f34573731bafa1db20f04027b2d25e02d8205921b569171699cdb33" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + [[package]] name = "syn" version = "2.0.104" @@ -241,10 +351,30 @@ name = "system" version = "0.1.0" dependencies = [ "common", + "glib-sys", "qom", "util", ] +[[package]] +name = "system-deps" +version = "7.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4be53aa0cba896d2dc615bd42bbc130acdcffa239e0a2d965ea5b3b2a86ffdb" +dependencies = [ + "cfg-expr", + "heck", + "pkg-config", + "toml", + "version-compare", +] + +[[package]] +name = "target-lexicon" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" + [[package]] name = "tests" version = "0.1.0" @@ -259,6 +389,40 @@ dependencies = [ "util", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + [[package]] name = "trace" version = "0.1.0" @@ -279,11 +443,27 @@ dependencies = [ "anyhow", "common", "foreign", + "glib-sys", "libc", ] +[[package]] +name = "version-compare" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852e951cb7832cb45cb1169900d19760cfa39b82bc0ea9c0e5a14ae88411c98b" + [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "winnow" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" +dependencies = [ + "memchr", +] diff --git a/rust/Cargo.toml b/rust/Cargo.toml index f372d7dbf7..783e626802 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -29,6 +29,7 @@ authors = ["The QEMU Project Developers "] anyhow = "~1.0" foreign = "~0.3.1" libc = "0.2.162" +glib-sys = { version = "0.21.2", features = ["v2_66"] } [workspace.lints.rust] unexpected_cfgs = { level = "deny", check-cfg = ['cfg(MESON)'] } diff --git a/rust/bql/Cargo.toml b/rust/bql/Cargo.toml index 1041bd4ea9..d5177e5f8e 100644 --- a/rust/bql/Cargo.toml +++ b/rust/bql/Cargo.toml @@ -14,6 +14,7 @@ rust-version.workspace = true [dependencies] migration = { path = "../migration" } +glib-sys.workspace = true [features] default = ["debug_cell"] diff --git a/rust/bql/meson.build b/rust/bql/meson.build index bc51c7f160..22d7c9b877 100644 --- a/rust/bql/meson.build +++ b/rust/bql/meson.build @@ -38,6 +38,7 @@ _bql_rs = static_library( rust_abi: 'rust', rust_args: _bql_cfg, link_with: [_migration_rs], + dependencies: [glib_sys_rs], ) bql_rs = declare_dependency(link_with: [_bql_rs], diff --git a/rust/bql/src/bindings.rs b/rust/bql/src/bindings.rs index 9ffff12cde..8c70f3a87c 100644 --- a/rust/bql/src/bindings.rs +++ b/rust/bql/src/bindings.rs @@ -18,6 +18,10 @@ clippy::too_many_arguments )] +use glib_sys::{ + guint, GArray, GHashTable, GHashTableIter, GList, GPollFD, GPtrArray, GQueue, GSList, GSource, +}; + #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/chardev/Cargo.toml b/rust/chardev/Cargo.toml index 3e77972546..f105189dcc 100644 --- a/rust/chardev/Cargo.toml +++ b/rust/chardev/Cargo.toml @@ -13,6 +13,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] +glib-sys = { workspace = true } common = { path = "../common" } bql = { path = "../bql" } migration = { path = "../migration" } diff --git a/rust/chardev/meson.build b/rust/chardev/meson.build index e7ce02b3bc..d365d8dd0f 100644 --- a/rust/chardev/meson.build +++ b/rust/chardev/meson.build @@ -36,7 +36,7 @@ _chardev_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', link_with: [_bql_rs, _migration_rs, _qom_rs, _util_rs], - dependencies: [common_rs, qemu_macros], + dependencies: [glib_sys_rs, common_rs, qemu_macros], ) chardev_rs = declare_dependency(link_with: [_chardev_rs], dependencies: [chardev, qemuutil]) diff --git a/rust/chardev/src/bindings.rs b/rust/chardev/src/bindings.rs index 2d98026d62..c95dc89c56 100644 --- a/rust/chardev/src/bindings.rs +++ b/rust/chardev/src/bindings.rs @@ -19,6 +19,10 @@ )] use common::Zeroable; +use glib_sys::{ + gboolean, guint, GArray, GHashTable, GHashTableIter, GIOCondition, GList, GMainContext, + GPollFD, GPtrArray, GQueue, GSList, GSource, GSourceFunc, +}; #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/hw/char/pl011/Cargo.toml b/rust/hw/char/pl011/Cargo.toml index dc41d0e499..5b319455ee 100644 --- a/rust/hw/char/pl011/Cargo.toml +++ b/rust/hw/char/pl011/Cargo.toml @@ -13,6 +13,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] +glib-sys.workspace = true bilge = { version = "0.2.0" } bilge-impl = { version = "0.2.0" } bits = { path = "../../../bits" } diff --git a/rust/hw/char/pl011/meson.build b/rust/hw/char/pl011/meson.build index 07b3da17e8..33b91f2191 100644 --- a/rust/hw/char/pl011/meson.build +++ b/rust/hw/char/pl011/meson.build @@ -33,6 +33,7 @@ _libpl011_rs = static_library( bilge_impl_rs, bits_rs, common_rs, + glib_sys_rs, util_rs, migration_rs, bql_rs, diff --git a/rust/hw/char/pl011/src/bindings.rs b/rust/hw/char/pl011/src/bindings.rs index bd5ea840cb..52a76d0de5 100644 --- a/rust/hw/char/pl011/src/bindings.rs +++ b/rust/hw/char/pl011/src/bindings.rs @@ -20,6 +20,11 @@ //! `bindgen`-generated declarations. +use glib_sys::{ + gboolean, guint, GArray, GByteArray, GHashTable, GHashTableIter, GIOCondition, GList, + GMainContext, GPollFD, GPtrArray, GQueue, GSList, GSource, GSourceFunc, GString, +}; + #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/hw/core/Cargo.toml b/rust/hw/core/Cargo.toml index 9a9aa51708..ecfb564718 100644 --- a/rust/hw/core/Cargo.toml +++ b/rust/hw/core/Cargo.toml @@ -13,6 +13,7 @@ repository.workspace = true rust-version.workspace = true [dependencies] +glib-sys.workspace = true qemu_macros = { path = "../../qemu-macros" } common = { path = "../../common" } bql = { path = "../../bql" } diff --git a/rust/hw/core/meson.build b/rust/hw/core/meson.build index e1ae95ed61..1560dd20c6 100644 --- a/rust/hw/core/meson.build +++ b/rust/hw/core/meson.build @@ -59,7 +59,7 @@ _hwcore_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', link_with: [_bql_rs, _chardev_rs, _migration_rs, _qom_rs, _system_rs, _util_rs], - dependencies: [qemu_macros, common_rs], + dependencies: [glib_sys_rs, qemu_macros, common_rs], ) hwcore_rs = declare_dependency(link_with: [_hwcore_rs], diff --git a/rust/hw/core/src/bindings.rs b/rust/hw/core/src/bindings.rs index 919c02b56a..65b9aae753 100644 --- a/rust/hw/core/src/bindings.rs +++ b/rust/hw/core/src/bindings.rs @@ -20,6 +20,9 @@ use chardev::bindings::Chardev; use common::Zeroable; +use glib_sys::{ + GArray, GByteArray, GHashTable, GHashTableIter, GList, GPtrArray, GQueue, GSList, GString, +}; use migration::bindings::VMStateDescription; use qom::bindings::ObjectClass; use system::bindings::MemoryRegion; diff --git a/rust/migration/Cargo.toml b/rust/migration/Cargo.toml index 708bfaaa68..94504f3625 100644 --- a/rust/migration/Cargo.toml +++ b/rust/migration/Cargo.toml @@ -15,6 +15,7 @@ rust-version.workspace = true [dependencies] common = { path = "../common" } util = { path = "../util" } +glib-sys.workspace = true [lints] workspace = true diff --git a/rust/migration/meson.build b/rust/migration/meson.build index ddf5c2f51d..18be65c92c 100644 --- a/rust/migration/meson.build +++ b/rust/migration/meson.build @@ -38,7 +38,7 @@ _migration_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', link_with: [_util_rs], - dependencies: [common_rs], + dependencies: [common_rs, glib_sys_rs], ) migration_rs = declare_dependency(link_with: [_migration_rs], diff --git a/rust/migration/src/bindings.rs b/rust/migration/src/bindings.rs index 8ce13a9000..24503eb69b 100644 --- a/rust/migration/src/bindings.rs +++ b/rust/migration/src/bindings.rs @@ -19,6 +19,7 @@ )] use common::Zeroable; +use glib_sys::{GHashTable, GHashTableIter, GList, GPtrArray, GQueue, GSList}; #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/qom/Cargo.toml b/rust/qom/Cargo.toml index 060ad2ec34..4be3c2541b 100644 --- a/rust/qom/Cargo.toml +++ b/rust/qom/Cargo.toml @@ -18,6 +18,7 @@ bql = { path = "../bql" } migration = { path = "../migration" } qemu_macros = { path = "../qemu-macros" } util = { path = "../util" } +glib-sys.workspace = true [lints] workspace = true diff --git a/rust/qom/meson.build b/rust/qom/meson.build index 71fdac696c..e50f41858d 100644 --- a/rust/qom/meson.build +++ b/rust/qom/meson.build @@ -29,7 +29,7 @@ _qom_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', link_with: [_bql_rs, _migration_rs], - dependencies: [common_rs, qemu_macros], + dependencies: [common_rs, glib_sys_rs, qemu_macros], ) qom_rs = declare_dependency(link_with: [_qom_rs], dependencies: [qemu_macros, qom]) diff --git a/rust/qom/src/bindings.rs b/rust/qom/src/bindings.rs index 9ffff12cde..91de42f242 100644 --- a/rust/qom/src/bindings.rs +++ b/rust/qom/src/bindings.rs @@ -18,6 +18,8 @@ clippy::too_many_arguments )] +use glib_sys::{GHashTable, GHashTableIter, GList, GPtrArray, GQueue, GSList}; + #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/system/Cargo.toml b/rust/system/Cargo.toml index 7fd369b9e3..186ea00bff 100644 --- a/rust/system/Cargo.toml +++ b/rust/system/Cargo.toml @@ -16,6 +16,7 @@ rust-version.workspace = true common = { path = "../common" } qom = { path = "../qom" } util = { path = "../util" } +glib-sys.workspace = true [lints] workspace = true diff --git a/rust/system/meson.build b/rust/system/meson.build index 0859f39745..73d6199114 100644 --- a/rust/system/meson.build +++ b/rust/system/meson.build @@ -36,7 +36,7 @@ _system_rs = static_library( override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', link_with: [_bql_rs, _migration_rs, _qom_rs, _util_rs], - dependencies: [common_rs, qemu_macros], + dependencies: [glib_sys_rs, common_rs, qemu_macros], ) system_rs = declare_dependency(link_with: [_system_rs], diff --git a/rust/system/src/bindings.rs b/rust/system/src/bindings.rs index 43edd98807..6cbb588de3 100644 --- a/rust/system/src/bindings.rs +++ b/rust/system/src/bindings.rs @@ -19,6 +19,10 @@ )] use common::Zeroable; +use glib_sys::{ + guint, GArray, GByteArray, GHashTable, GHashTableIter, GList, GPollFD, GPtrArray, GQueue, + GSList, GString, +}; #[cfg(MESON)] include!("bindings.inc.rs"); diff --git a/rust/util/Cargo.toml b/rust/util/Cargo.toml index 1f6767ed9d..85f9143654 100644 --- a/rust/util/Cargo.toml +++ b/rust/util/Cargo.toml @@ -15,6 +15,7 @@ rust-version.workspace = true [dependencies] anyhow = { workspace = true } foreign = { workspace = true } +glib-sys = { workspace = true } libc = { workspace = true } common = { path = "../common" } diff --git a/rust/util/meson.build b/rust/util/meson.build index 094b43355a..b0b75e93ff 100644 --- a/rust/util/meson.build +++ b/rust/util/meson.build @@ -40,7 +40,7 @@ _util_rs = static_library( ), override_options: ['rust_std=2021', 'build.rust_std=2021'], rust_abi: 'rust', - dependencies: [anyhow_rs, libc_rs, foreign_rs, common_rs, qom, qemuutil], + dependencies: [anyhow_rs, libc_rs, foreign_rs, glib_sys_rs, common_rs, qom, qemuutil], ) util_rs = declare_dependency(link_with: [_util_rs], dependencies: [qemuutil, qom]) diff --git a/rust/util/src/bindings.rs b/rust/util/src/bindings.rs index 9ffff12cde..c277a295ad 100644 --- a/rust/util/src/bindings.rs +++ b/rust/util/src/bindings.rs @@ -18,6 +18,8 @@ clippy::too_many_arguments )] +use glib_sys::{guint, GList, GPollFD, GQueue, GSList, GString}; + #[cfg(MESON)] include!("bindings.inc.rs"); From 2438f18ee01108ead65be81be056e5a5da69120c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 7 Oct 2025 17:45:58 +0400 Subject: [PATCH 04/35] build-sys: default to host vendor for rust target triple MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes docker-test@alpine, which uses "alpine" vendor. Signed-off-by: Marc-André Lureau Link: https://lore.kernel.org/r/20251007134558.251670-1-marcandre.lureau@redhat.com Signed-off-by: Paolo Bonzini --- configure | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/configure b/configure index 78445cbb4b..a467f3a2e0 100755 --- a/configure +++ b/configure @@ -1216,8 +1216,9 @@ fi if test "$rust" != disabled && test -z "$rust_target_triple"; then # arch and os generally matches between meson and rust rust_arch=$host_arch + # default to host vendor + rust_vendor=$(echo "$rust_host_triple" | cut -d'-' -f2) rust_os=$host_os - rust_machine=unknown rust_osvariant= # tweak rust_os if needed; also, machine and variant depend on the OS @@ -1225,7 +1226,7 @@ if test "$rust" != disabled && test -z "$rust_target_triple"; then case "$host_os" in darwin) # e.g. aarch64-apple-darwin - rust_machine=apple + rust_vendor=apple ;; linux) @@ -1273,13 +1274,13 @@ EOF ;; sunos) - rust_machine=pc + rust_vendor=pc rust_os=solaris ;; windows) # e.g. aarch64-pc-windows-gnullvm, x86_64-pc-windows-gnu (MSVC not supported) - rust_machine=pc + rust_vendor=pc if test "$host_arch" = aarch64; then rust_osvariant=gnullvm else @@ -1310,7 +1311,7 @@ EOF sparc64) if test "$rust_os" = solaris; then rust_arch=sparcv9 - rust_machine=sun + rust_vendor=sun fi ;; @@ -1324,7 +1325,7 @@ EOF # e.g. aarch64-linux-android rust_target_triple=$rust_arch-$rust_os-$rust_osvariant else - rust_target_triple=$rust_arch-$rust_machine-$rust_os${rust_osvariant:+-$rust_osvariant} + rust_target_triple=$rust_arch-$rust_vendor-$rust_os${rust_osvariant:+-$rust_osvariant} fi fi From e9efa4a77168ac2816bf9471f878252ce6224710 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 23 Sep 2025 12:22:54 +0200 Subject: [PATCH 05/35] target/i386: add compatibility property for arch_capabilities Prior to v10.1, if requested by user, arch-capabilities is always on despite the fact that CPUID advertises it to be off/unvailable. This causes a migration issue for VMs that are run on a machine without arch-capabilities and expect this feature to be present on the destination host with QEMU 10.1. Add a compatibility property to restore the legacy behavior for all machines with version prior to 10.1. To preserve the functionality (added by 10.1) of turning off ARCH_CAPABILITIES where Windows does not like it, use directly the guest CPU vendor: x86_cpu_get_supported_feature_word is not KVM-specific and therefore should not necessarily use the host CPUID. Co-authored-by: Hector Cao Signed-off-by: Hector Cao Fixes: d3a24134e37 ("target/i386: do not expose ARCH_CAPABILITIES on AMD CPU", 2025-07-17) Signed-off-by: Paolo Bonzini --- hw/i386/pc.c | 1 + target/i386/cpu.c | 17 +++++++++++++++++ target/i386/cpu.h | 6 ++++++ target/i386/kvm/kvm.c | 6 +----- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index bc048a6d13..d7f48150fd 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -87,6 +87,7 @@ const size_t pc_compat_10_1_len = G_N_ELEMENTS(pc_compat_10_1); GlobalProperty pc_compat_10_0[] = { { TYPE_X86_CPU, "x-consistent-cache", "false" }, { TYPE_X86_CPU, "x-vendor-cpuid-only-v2", "false" }, + { TYPE_X86_CPU, "x-arch-cap-always-on", "true" }, }; const size_t pc_compat_10_0_len = G_N_ELEMENTS(pc_compat_10_0); diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 6d85149e6e..fe369bb128 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -7539,6 +7539,20 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w) #endif break; + case FEAT_7_0_EDX: + /* + * Windows does not like ARCH_CAPABILITIES on AMD machines at all. + * Do not show the fake ARCH_CAPABILITIES MSR that KVM sets up, + * except if needed for migration. + * + * When arch_cap_always_on is removed, this tweak can move to + * kvm_arch_get_supported_cpuid. + */ + if (cpu && IS_AMD_CPU(&cpu->env) && !cpu->arch_cap_always_on) { + unavail = CPUID_7_0_EDX_ARCH_CAPABILITIES; + } + break; + default: break; } @@ -10004,6 +10018,9 @@ static const Property x86_cpu_properties[] = { true), DEFINE_PROP_BOOL("x-l1-cache-per-thread", X86CPU, l1_cache_per_core, true), DEFINE_PROP_BOOL("x-force-cpuid-0x1f", X86CPU, force_cpuid_0x1f, false), + + DEFINE_PROP_BOOL("x-arch-cap-always-on", X86CPU, + arch_cap_always_on, false), }; #ifndef CONFIG_USER_ONLY diff --git a/target/i386/cpu.h b/target/i386/cpu.h index e0be7a7406..414ca968e8 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -2314,6 +2314,12 @@ struct ArchCPU { /* Forcefully disable KVM PV features not exposed in guest CPUIDs */ bool kvm_pv_enforce_cpuid; + /* + * Expose arch-capabilities unconditionally even on AMD models, for backwards + * compatibility with QEMU <10.1. + */ + bool arch_cap_always_on; + /* Number of physical address bits supported */ uint32_t phys_bits; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 6a3a1c1ed8..db40caa341 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -503,12 +503,8 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts. * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is * returned by KVM_GET_MSR_INDEX_LIST. - * - * But also, because Windows does not like ARCH_CAPABILITIES on AMD - * mcahines at all, do not show the fake ARCH_CAPABILITIES MSR that - * KVM sets up. */ - if (!has_msr_arch_capabs || !(edx & CPUID_7_0_EDX_ARCH_CAPABILITIES)) { + if (!has_msr_arch_capabs) { ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES; } } else if (function == 7 && index == 1 && reg == R_EAX) { From 6529f31e0dccadb532c80b36e3efe7aef83f9cad Mon Sep 17 00:00:00 2001 From: Hector Cao Date: Tue, 23 Sep 2025 12:16:41 +0200 Subject: [PATCH 06/35] target/i386: add compatibility property for pdcm feature The pdcm feature is supposed to be disabled when PMU is not available. Up until v10.1, pdcm feature is enabled even when PMU is off. This behavior has been fixed but this change breaks the migration of VMs that are run with QEMU < 10.0 and expect the pdcm feature to be enabled on the destination host. This commit restores the legacy behavior for machines with version prior to 10.1 to allow the migration from older QEMU to QEMU 10.1. Signed-off-by: Hector Cao Link: https://lore.kernel.org/r/20250910115733.21149-3-hector.cao@canonical.com Fixes: e68ec298090 ("i386/cpu: Move adjustment of CPUID_EXT_PDCM before feature_dependencies[] check", 2025-06-20) [Move property from migration object to CPU. - Paolo] Signed-off-by: Paolo Bonzini --- hw/i386/pc.c | 1 + target/i386/cpu.c | 15 ++++++++++++--- target/i386/cpu.h | 6 ++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index d7f48150fd..4668918746 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -88,6 +88,7 @@ GlobalProperty pc_compat_10_0[] = { { TYPE_X86_CPU, "x-consistent-cache", "false" }, { TYPE_X86_CPU, "x-vendor-cpuid-only-v2", "false" }, { TYPE_X86_CPU, "x-arch-cap-always-on", "true" }, + { TYPE_X86_CPU, "x-pdcm-on-even-without-pmu", "true" }, }; const size_t pc_compat_10_0_len = G_N_ELEMENTS(pc_compat_10_0); diff --git a/target/i386/cpu.c b/target/i386/cpu.c index fe369bb128..ab18de894e 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -7908,6 +7908,11 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, /* Fixup overflow: max value for bits 23-16 is 255. */ *ebx |= MIN(num, 255) << 16; } + if (cpu->pdcm_on_even_without_pmu) { + if (!cpu->enable_pmu) { + *ecx &= ~CPUID_EXT_PDCM; + } + } break; case 2: { /* cache info: needed for Pentium Pro compatibility */ const CPUCaches *caches; @@ -8958,9 +8963,11 @@ void x86_cpu_expand_features(X86CPU *cpu, Error **errp) } } - /* PDCM is fixed1 bit for TDX */ - if (!cpu->enable_pmu && !is_tdx_vm()) { - env->features[FEAT_1_ECX] &= ~CPUID_EXT_PDCM; + if (!cpu->pdcm_on_even_without_pmu) { + /* PDCM is fixed1 bit for TDX */ + if (!cpu->enable_pmu && !is_tdx_vm()) { + env->features[FEAT_1_ECX] &= ~CPUID_EXT_PDCM; + } } for (i = 0; i < ARRAY_SIZE(feature_dependencies); i++) { @@ -10021,6 +10028,8 @@ static const Property x86_cpu_properties[] = { DEFINE_PROP_BOOL("x-arch-cap-always-on", X86CPU, arch_cap_always_on, false), + DEFINE_PROP_BOOL("x-pdcm-on-even-without-pmu", X86CPU, + pdcm_on_even_without_pmu, false), }; #ifndef CONFIG_USER_ONLY diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 414ca968e8..42168f1d6d 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -2320,6 +2320,12 @@ struct ArchCPU { */ bool arch_cap_always_on; + /* + * Backwards compatibility with QEMU <10.1. The PDCM feature is now disabled when + * PMU is not available, but prior to 10.1 it was enabled even if PMU is off. + */ + bool pdcm_on_even_without_pmu; + /* Number of physical address bits supported */ uint32_t phys_bits; From 37e12da5df8eb74042f11e9e7bec8a50b8090adb Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:21 +0200 Subject: [PATCH 07/35] accel: Add Meson and config support for MSHV accelerator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a Meson feature option and default-config entry to allow building QEMU with MSHV (Microsoft Hypervisor) acceleration support. This is the first step toward implementing an MSHV backend in QEMU. Signed-off-by: Magnus Kulke Reviewed-by: Daniel P. Berrangé Link: https://lore.kernel.org/r/20250916164847.77883-2-magnuskulke@linux.microsoft.com [Add error for unavailable accelerator. - Paolo] Signed-off-by: Paolo Bonzini --- accel/Kconfig | 3 +++ meson.build | 13 +++++++++++++ meson_options.txt | 2 ++ scripts/meson-buildoptions.sh | 3 +++ 4 files changed, 21 insertions(+) diff --git a/accel/Kconfig b/accel/Kconfig index 4263cab722..a60f114923 100644 --- a/accel/Kconfig +++ b/accel/Kconfig @@ -13,6 +13,9 @@ config TCG config KVM bool +config MSHV + bool + config XEN bool select FSDEV_9P if VIRTFS diff --git a/meson.build b/meson.build index 62766c0f19..167021ed62 100644 --- a/meson.build +++ b/meson.build @@ -334,6 +334,7 @@ elif cpu == 'x86_64' 'CONFIG_HVF': ['x86_64-softmmu'], 'CONFIG_NVMM': ['i386-softmmu', 'x86_64-softmmu'], 'CONFIG_WHPX': ['i386-softmmu', 'x86_64-softmmu'], + 'CONFIG_MSHV': ['x86_64-softmmu'], } endif @@ -883,6 +884,14 @@ accelerators = [] if get_option('kvm').allowed() and host_os == 'linux' accelerators += 'CONFIG_KVM' endif + +if get_option('mshv').allowed() and host_os == 'linux' + if get_option('mshv').enabled() and host_machine.cpu() != 'x86_64' + error('mshv accelerator requires x64_64 host') + endif + accelerators += 'CONFIG_MSHV' +endif + if get_option('whpx').allowed() and host_os == 'windows' if get_option('whpx').enabled() and host_machine.cpu() != 'x86_64' error('WHPX requires 64-bit host') @@ -952,6 +961,9 @@ endif if 'CONFIG_WHPX' not in accelerators and get_option('whpx').enabled() error('WHPX not available on this platform') endif +if 'CONFIG_MSHV' not in accelerators and get_option('mshv').enabled() + error('mshv not available on this platform') +endif xen = not_found if get_option('xen').enabled() or (get_option('xen').auto() and have_system) @@ -4827,6 +4839,7 @@ if have_system summary_info += {'HVF support': config_all_accel.has_key('CONFIG_HVF')} summary_info += {'WHPX support': config_all_accel.has_key('CONFIG_WHPX')} summary_info += {'NVMM support': config_all_accel.has_key('CONFIG_NVMM')} + summary_info += {'MSHV support': config_all_accel.has_key('CONFIG_MSHV')} summary_info += {'Xen support': xen.found()} if xen.found() summary_info += {'xen ctrl version': xen.version()} diff --git a/meson_options.txt b/meson_options.txt index fff1521e58..1be7d61efd 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -71,6 +71,8 @@ option('malloc', type : 'combo', choices : ['system', 'tcmalloc', 'jemalloc'], option('kvm', type: 'feature', value: 'auto', description: 'KVM acceleration support') +option('mshv', type: 'feature', value: 'auto', + description: 'MSHV acceleration support') option('whpx', type: 'feature', value: 'auto', description: 'WHPX acceleration support') option('hvf', type: 'feature', value: 'auto', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 0ebe6bc52a..d3d8f48bbb 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -154,6 +154,7 @@ meson_options_help() { printf "%s\n" ' membarrier membarrier system call (for Linux 4.14+ or Windows' printf "%s\n" ' modules modules support (non Windows)' printf "%s\n" ' mpath Multipath persistent reservation passthrough' + printf "%s\n" ' mshv MSHV acceleration support' printf "%s\n" ' multiprocess Out of process device emulation support' printf "%s\n" ' netmap netmap network backend support' printf "%s\n" ' nettle nettle cryptography support' @@ -408,6 +409,8 @@ _meson_option_parse() { --disable-modules) printf "%s" -Dmodules=disabled ;; --enable-mpath) printf "%s" -Dmpath=enabled ;; --disable-mpath) printf "%s" -Dmpath=disabled ;; + --enable-mshv) printf "%s" -Dmshv=enabled ;; + --disable-mshv) printf "%s" -Dmshv=disabled ;; --enable-multiprocess) printf "%s" -Dmultiprocess=enabled ;; --disable-multiprocess) printf "%s" -Dmultiprocess=disabled ;; --enable-netmap) printf "%s" -Dnetmap=enabled ;; From 1e25327b244a217078e3fa5df18322c70932f478 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:22 +0200 Subject: [PATCH 08/35] target/i386/emulate: Allow instruction decoding from stream Introduce a new helper function to decode x86 instructions from a raw instruction byte stream. MSHV delivers an instruction stream in a buffer of the vm_exit message. It can be used to speed up MMIO emulation, since instructions do not have to be fetched and translated. Added "fetch_instruction()" op to x86_emul_ops() to improve traceability. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-3-magnuskulke@linux.microsoft.com Signed-off-by: Paolo Bonzini --- target/i386/emulate/x86_decode.c | 27 +++++++++++++++++++++++---- target/i386/emulate/x86_decode.h | 9 +++++++++ target/i386/emulate/x86_emu.c | 3 ++- target/i386/emulate/x86_emu.h | 2 ++ 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/target/i386/emulate/x86_decode.c b/target/i386/emulate/x86_decode.c index 2eca39802e..97bd6f1a3b 100644 --- a/target/i386/emulate/x86_decode.c +++ b/target/i386/emulate/x86_decode.c @@ -71,10 +71,16 @@ static inline uint64_t decode_bytes(CPUX86State *env, struct x86_decode *decode, VM_PANIC_EX("%s invalid size %d\n", __func__, size); break; } - target_ulong va = linear_rip(env_cpu(env), env->eip) + decode->len; - emul_ops->read_mem(env_cpu(env), &val, va, size); + + /* copy the bytes from the instruction stream, if available */ + if (decode->stream && decode->len + size <= decode->stream->len) { + memcpy(&val, decode->stream->bytes + decode->len, size); + } else { + target_ulong va = linear_rip(env_cpu(env), env->eip) + decode->len; + emul_ops->fetch_instruction(env_cpu(env), &val, va, size); + } decode->len += size; - + return val; } @@ -2076,9 +2082,10 @@ static void decode_opcodes(CPUX86State *env, struct x86_decode *decode) } } -uint32_t decode_instruction(CPUX86State *env, struct x86_decode *decode) +static uint32_t decode_opcode(CPUX86State *env, struct x86_decode *decode) { memset(decode, 0, sizeof(*decode)); + decode_prefix(env, decode); set_addressing_size(env, decode); set_operand_size(env, decode); @@ -2088,6 +2095,18 @@ uint32_t decode_instruction(CPUX86State *env, struct x86_decode *decode) return decode->len; } +uint32_t decode_instruction(CPUX86State *env, struct x86_decode *decode) +{ + return decode_opcode(env, decode); +} + +uint32_t decode_instruction_stream(CPUX86State *env, struct x86_decode *decode, + struct x86_insn_stream *stream) +{ + decode->stream = stream; + return decode_opcode(env, decode); +} + void init_decoder(void) { int i; diff --git a/target/i386/emulate/x86_decode.h b/target/i386/emulate/x86_decode.h index 927645af1a..1cadf3694f 100644 --- a/target/i386/emulate/x86_decode.h +++ b/target/i386/emulate/x86_decode.h @@ -272,6 +272,11 @@ typedef struct x86_decode_op { }; } x86_decode_op; +typedef struct x86_insn_stream { + const uint8_t *bytes; + size_t len; +} x86_insn_stream; + typedef struct x86_decode { int len; uint8_t opcode[4]; @@ -298,11 +303,15 @@ typedef struct x86_decode { struct x86_modrm modrm; struct x86_decode_op op[4]; bool is_fpu; + + x86_insn_stream *stream; } x86_decode; uint64_t sign(uint64_t val, int size); uint32_t decode_instruction(CPUX86State *env, struct x86_decode *decode); +uint32_t decode_instruction_stream(CPUX86State *env, struct x86_decode *decode, + struct x86_insn_stream *stream); void *get_reg_ref(CPUX86State *env, int reg, int rex_present, int is_extended, int size); diff --git a/target/i386/emulate/x86_emu.c b/target/i386/emulate/x86_emu.c index db7a7f7437..4409f7bc13 100644 --- a/target/i386/emulate/x86_emu.c +++ b/target/i386/emulate/x86_emu.c @@ -1246,7 +1246,8 @@ static void init_cmd_handler(void) bool exec_instruction(CPUX86State *env, struct x86_decode *ins) { if (!_cmd_handler[ins->cmd].handler) { - printf("Unimplemented handler (" TARGET_FMT_lx ") for %d (%x %x) \n", env->eip, + printf("Unimplemented handler (" TARGET_FMT_lx ") for %d (%x %x)\n", + env->eip, ins->cmd, ins->opcode[0], ins->opcode_len > 1 ? ins->opcode[1] : 0); env->eip += ins->len; diff --git a/target/i386/emulate/x86_emu.h b/target/i386/emulate/x86_emu.h index a1a961284b..05686b162f 100644 --- a/target/i386/emulate/x86_emu.h +++ b/target/i386/emulate/x86_emu.h @@ -24,6 +24,8 @@ #include "cpu.h" struct x86_emul_ops { + void (*fetch_instruction)(CPUState *cpu, void *data, target_ulong addr, + int bytes); void (*read_mem)(CPUState *cpu, void *data, target_ulong addr, int bytes); void (*write_mem)(CPUState *cpu, void *data, target_ulong addr, int bytes); void (*read_segment_descriptor)(CPUState *cpu, struct x86_segment_descriptor *desc, From 0daf817c80b57e58168309420abf0a8a3d2a60f6 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:23 +0200 Subject: [PATCH 09/35] target/i386/mshv: Add x86 decoder/emu implementation The MSHV accelerator requires a x86 decoder/emulator in userland to emulate MMIO instructions. This change contains the implementations for the generalized i386 instruction decoder/emulator. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-4-magnuskulke@linux.microsoft.com Signed-off-by: Paolo Bonzini --- include/system/mshv.h | 25 +++ target/i386/cpu.h | 2 +- target/i386/emulate/meson.build | 7 +- target/i386/meson.build | 2 + target/i386/mshv/meson.build | 7 + target/i386/mshv/x86.c | 297 ++++++++++++++++++++++++++++++++ 6 files changed, 337 insertions(+), 3 deletions(-) create mode 100644 include/system/mshv.h create mode 100644 target/i386/mshv/meson.build create mode 100644 target/i386/mshv/x86.c diff --git a/include/system/mshv.h b/include/system/mshv.h new file mode 100644 index 0000000000..342f1ef6a9 --- /dev/null +++ b/include/system/mshv.h @@ -0,0 +1,25 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou + * Magnus Kulke + * Jinank Jain + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#ifndef QEMU_MSHV_H +#define QEMU_MSHV_H + +#ifdef COMPILING_PER_TARGET +#ifdef CONFIG_MSHV +#define CONFIG_MSHV_IS_POSSIBLE +#endif +#else +#define CONFIG_MSHV_IS_POSSIBLE +#endif + +#endif diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 42168f1d6d..3aec8fd41c 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -2126,7 +2126,7 @@ typedef struct CPUArchState { QEMUTimer *xen_periodic_timer; QemuMutex xen_timers_lock; #endif -#if defined(CONFIG_HVF) +#if defined(CONFIG_HVF) || defined(CONFIG_MSHV) void *emu_mmio_buf; #endif diff --git a/target/i386/emulate/meson.build b/target/i386/emulate/meson.build index 4edd4f462f..b6dafb6a5b 100644 --- a/target/i386/emulate/meson.build +++ b/target/i386/emulate/meson.build @@ -1,5 +1,8 @@ -i386_system_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: files( +emulator_files = files( 'x86_decode.c', 'x86_emu.c', 'x86_flags.c', -)) +) + +i386_system_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: emulator_files) +i386_system_ss.add(when: 'CONFIG_MSHV', if_true: emulator_files) diff --git a/target/i386/meson.build b/target/i386/meson.build index 092af34e2d..89ba4912aa 100644 --- a/target/i386/meson.build +++ b/target/i386/meson.build @@ -13,6 +13,7 @@ i386_ss.add(when: 'CONFIG_KVM', if_true: files('host-cpu.c')) i386_ss.add(when: 'CONFIG_HVF', if_true: files('host-cpu.c')) i386_ss.add(when: 'CONFIG_WHPX', if_true: files('host-cpu.c')) i386_ss.add(when: 'CONFIG_NVMM', if_true: files('host-cpu.c')) +i386_ss.add(when: 'CONFIG_MSHV', if_true: files('host-cpu.c')) i386_system_ss = ss.source_set() i386_system_ss.add(files( @@ -34,6 +35,7 @@ subdir('nvmm') subdir('hvf') subdir('tcg') subdir('emulate') +subdir('mshv') target_arch += {'i386': i386_ss} target_system_arch += {'i386': i386_system_ss} diff --git a/target/i386/mshv/meson.build b/target/i386/mshv/meson.build new file mode 100644 index 0000000000..8ddaa7c11d --- /dev/null +++ b/target/i386/mshv/meson.build @@ -0,0 +1,7 @@ +i386_mshv_ss = ss.source_set() + +i386_mshv_ss.add(files( + 'x86.c', +)) + +i386_system_ss.add_all(when: 'CONFIG_MSHV', if_true: i386_mshv_ss) diff --git a/target/i386/mshv/x86.c b/target/i386/mshv/x86.c new file mode 100644 index 0000000000..d574b3bc52 --- /dev/null +++ b/target/i386/mshv/x86.c @@ -0,0 +1,297 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Magnus Kulke + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" + +#include "cpu.h" +#include "emulate/x86_decode.h" +#include "emulate/x86_emu.h" +#include "qemu/typedefs.h" +#include "qemu/error-report.h" +#include "system/mshv.h" + +/* RW or Exec segment */ +static const uint8_t RWRX_SEGMENT_TYPE = 0x2; +static const uint8_t CODE_SEGMENT_TYPE = 0x8; +static const uint8_t EXPAND_DOWN_SEGMENT_TYPE = 0x4; + +typedef enum CpuMode { + REAL_MODE, + PROTECTED_MODE, + LONG_MODE, +} CpuMode; + +static CpuMode cpu_mode(CPUState *cpu) +{ + enum CpuMode m = REAL_MODE; + + if (x86_is_protected(cpu)) { + m = PROTECTED_MODE; + + if (x86_is_long_mode(cpu)) { + m = LONG_MODE; + } + } + + return m; +} + +static bool segment_type_ro(const SegmentCache *seg) +{ + uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15; + return (type_ & (~RWRX_SEGMENT_TYPE)) == 0; +} + +static bool segment_type_code(const SegmentCache *seg) +{ + uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15; + return (type_ & CODE_SEGMENT_TYPE) != 0; +} + +static bool segment_expands_down(const SegmentCache *seg) +{ + uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15; + + if (segment_type_code(seg)) { + return false; + } + + return (type_ & EXPAND_DOWN_SEGMENT_TYPE) != 0; +} + +static uint32_t segment_limit(const SegmentCache *seg) +{ + uint32_t limit = seg->limit; + uint32_t granularity = (seg->flags & DESC_G_MASK) != 0; + + if (granularity != 0) { + limit = (limit << 12) | 0xFFF; + } + + return limit; +} + +static uint8_t segment_db(const SegmentCache *seg) +{ + return (seg->flags >> DESC_B_SHIFT) & 1; +} + +static uint32_t segment_max_limit(const SegmentCache *seg) +{ + if (segment_db(seg) != 0) { + return 0xFFFFFFFF; + } + return 0xFFFF; +} + +static int linearize(CPUState *cpu, + target_ulong logical_addr, target_ulong *linear_addr, + X86Seg seg_idx) +{ + enum CpuMode mode; + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + SegmentCache *seg = &env->segs[seg_idx]; + target_ulong base = seg->base; + target_ulong logical_addr_32b; + uint32_t limit; + /* TODO: the emulator will not pass us "write" indicator yet */ + bool write = false; + + mode = cpu_mode(cpu); + + switch (mode) { + case LONG_MODE: + if (__builtin_add_overflow(logical_addr, base, linear_addr)) { + error_report("Address overflow"); + return -1; + } + break; + case PROTECTED_MODE: + case REAL_MODE: + if (segment_type_ro(seg) && write) { + error_report("Cannot write to read-only segment"); + return -1; + } + + logical_addr_32b = logical_addr & 0xFFFFFFFF; + limit = segment_limit(seg); + + if (segment_expands_down(seg)) { + if (logical_addr_32b >= limit) { + error_report("Address exceeds limit (expands down)"); + return -1; + } + + limit = segment_max_limit(seg); + } + + if (logical_addr_32b > limit) { + error_report("Address exceeds limit %u", limit); + return -1; + } + *linear_addr = logical_addr_32b + base; + break; + default: + error_report("Unknown cpu mode: %d", mode); + return -1; + } + + return 0; +} + +bool x86_read_segment_descriptor(CPUState *cpu, + struct x86_segment_descriptor *desc, + x86_segment_selector sel) +{ + target_ulong base; + uint32_t limit; + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + target_ulong gva; + + memset(desc, 0, sizeof(*desc)); + + /* valid gdt descriptors start from index 1 */ + if (!sel.index && GDT_SEL == sel.ti) { + return false; + } + + if (GDT_SEL == sel.ti) { + base = env->gdt.base; + limit = env->gdt.limit; + } else { + base = env->ldt.base; + limit = env->ldt.limit; + } + + if (sel.index * 8 >= limit) { + return false; + } + + gva = base + sel.index * 8; + emul_ops->read_mem(cpu, desc, gva, sizeof(*desc)); + + return true; +} + +bool x86_read_call_gate(CPUState *cpu, struct x86_call_gate *idt_desc, + int gate) +{ + target_ulong base; + uint32_t limit; + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + target_ulong gva; + + base = env->idt.base; + limit = env->idt.limit; + + memset(idt_desc, 0, sizeof(*idt_desc)); + if (gate * 8 >= limit) { + perror("call gate exceeds idt limit"); + return false; + } + + gva = base + gate * 8; + emul_ops->read_mem(cpu, idt_desc, gva, sizeof(*idt_desc)); + + return true; +} + +bool x86_is_protected(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + uint64_t cr0 = env->cr[0]; + + return cr0 & CR0_PE_MASK; +} + +bool x86_is_real(CPUState *cpu) +{ + return !x86_is_protected(cpu); +} + +bool x86_is_v8086(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + return x86_is_protected(cpu) && (env->eflags & VM_MASK); +} + +bool x86_is_long_mode(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + uint64_t efer = env->efer; + uint64_t lme_lma = (MSR_EFER_LME | MSR_EFER_LMA); + + return ((efer & lme_lma) == lme_lma); +} + +bool x86_is_long64_mode(CPUState *cpu) +{ + error_report("unimplemented: is_long64_mode()"); + abort(); +} + +bool x86_is_paging_mode(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + uint64_t cr0 = env->cr[0]; + + return cr0 & CR0_PG_MASK; +} + +bool x86_is_pae_enabled(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + uint64_t cr4 = env->cr[4]; + + return cr4 & CR4_PAE_MASK; +} + +target_ulong linear_addr(CPUState *cpu, target_ulong addr, X86Seg seg) +{ + int ret; + target_ulong linear_addr; + + ret = linearize(cpu, addr, &linear_addr, seg); + if (ret < 0) { + error_report("failed to linearize address"); + abort(); + } + + return linear_addr; +} + +target_ulong linear_addr_size(CPUState *cpu, target_ulong addr, int size, + X86Seg seg) +{ + switch (size) { + case 2: + addr = (uint16_t)addr; + break; + case 4: + addr = (uint32_t)addr; + break; + default: + break; + } + return linear_addr(cpu, addr, seg); +} + +target_ulong linear_rip(CPUState *cpu, target_ulong rip) +{ + return linear_addr(cpu, rip, R_CS); +} From 638ac1c78457dd93ccd795b9c6c2673af8c7dd21 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:24 +0200 Subject: [PATCH 10/35] hw/intc: Generalize APIC helper names from kvm_* to accel_* Rename APIC helper functions to use an accel_* prefix instead of kvm_* to support use by accelerators other than KVM. This is a preparatory step for integrating MSHV support with common APIC logic. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-5-magnuskulke@linux.microsoft.com [Remove dead definition of mshv_msi_via_irqfd_enabled. - Paolo] Signed-off-by: Paolo Bonzini --- accel/accel-irq.c | 106 +++++++++++++++++++++++++++++++++++++ accel/meson.build | 2 +- hw/intc/ioapic.c | 20 ++++--- hw/virtio/virtio-pci.c | 21 ++++---- include/system/accel-irq.h | 37 +++++++++++++ include/system/mshv.h | 17 ++++++ 6 files changed, 185 insertions(+), 18 deletions(-) create mode 100644 accel/accel-irq.c create mode 100644 include/system/accel-irq.h diff --git a/accel/accel-irq.c b/accel/accel-irq.c new file mode 100644 index 0000000000..7f864e35c4 --- /dev/null +++ b/accel/accel-irq.c @@ -0,0 +1,106 @@ +/* + * Accelerated irqchip abstraction + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou + * Magnus Kulke + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/pci/msi.h" + +#include "system/kvm.h" +#include "system/mshv.h" +#include "system/accel-irq.h" + +int accel_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + return mshv_irqchip_add_msi_route(vector, dev); + } +#endif + if (kvm_enabled()) { + return kvm_irqchip_add_msi_route(c, vector, dev); + } + return -ENOSYS; +} + +int accel_irqchip_update_msi_route(int vector, MSIMessage msg, PCIDevice *dev) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + return mshv_irqchip_update_msi_route(vector, msg, dev); + } +#endif + if (kvm_enabled()) { + return kvm_irqchip_update_msi_route(kvm_state, vector, msg, dev); + } + return -ENOSYS; +} + +void accel_irqchip_commit_route_changes(KVMRouteChange *c) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + mshv_irqchip_commit_routes(); + } +#endif + if (kvm_enabled()) { + kvm_irqchip_commit_route_changes(c); + } +} + +void accel_irqchip_commit_routes(void) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + mshv_irqchip_commit_routes(); + } +#endif + if (kvm_enabled()) { + kvm_irqchip_commit_routes(kvm_state); + } +} + +void accel_irqchip_release_virq(int virq) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + mshv_irqchip_release_virq(virq); + } +#endif + if (kvm_enabled()) { + kvm_irqchip_release_virq(kvm_state, virq); + } +} + +int accel_irqchip_add_irqfd_notifier_gsi(EventNotifier *n, EventNotifier *rn, + int virq) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + return mshv_irqchip_add_irqfd_notifier_gsi(n, rn, virq); + } +#endif + if (kvm_enabled()) { + return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, rn, virq); + } + return -ENOSYS; +} + +int accel_irqchip_remove_irqfd_notifier_gsi(EventNotifier *n, int virq) +{ +#ifdef CONFIG_MSHV_IS_POSSIBLE + if (mshv_msi_via_irqfd_enabled()) { + return mshv_irqchip_remove_irqfd_notifier_gsi(n, virq); + } +#endif + if (kvm_enabled()) { + return kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, virq); + } + return -ENOSYS; +} diff --git a/accel/meson.build b/accel/meson.build index 25b0f100b5..6349efe682 100644 --- a/accel/meson.build +++ b/accel/meson.build @@ -1,6 +1,6 @@ common_ss.add(files('accel-common.c')) specific_ss.add(files('accel-target.c')) -system_ss.add(files('accel-system.c', 'accel-blocker.c', 'accel-qmp.c')) +system_ss.add(files('accel-system.c', 'accel-blocker.c', 'accel-qmp.c', 'accel-irq.c')) user_ss.add(files('accel-user.c')) subdir('tcg') diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c index 133bef852d..e431d00311 100644 --- a/hw/intc/ioapic.c +++ b/hw/intc/ioapic.c @@ -30,12 +30,18 @@ #include "hw/intc/ioapic_internal.h" #include "hw/pci/msi.h" #include "hw/qdev-properties.h" +#include "system/accel-irq.h" #include "system/kvm.h" #include "system/system.h" #include "hw/i386/apic-msidef.h" #include "hw/i386/x86-iommu.h" #include "trace.h" + +#if defined(CONFIG_KVM) || defined(CONFIG_MSHV) +#define ACCEL_GSI_IRQFD_POSSIBLE +#endif + #define APIC_DELIVERY_MODE_SHIFT 8 #define APIC_POLARITY_SHIFT 14 #define APIC_TRIG_MODE_SHIFT 15 @@ -191,10 +197,10 @@ static void ioapic_set_irq(void *opaque, int vector, int level) static void ioapic_update_kvm_routes(IOAPICCommonState *s) { -#ifdef CONFIG_KVM +#ifdef ACCEL_GSI_IRQFD_POSSIBLE int i; - if (kvm_irqchip_is_split()) { + if (accel_irqchip_is_split()) { for (i = 0; i < IOAPIC_NUM_PINS; i++) { MSIMessage msg; struct ioapic_entry_info info; @@ -202,15 +208,15 @@ static void ioapic_update_kvm_routes(IOAPICCommonState *s) if (!info.masked) { msg.address = info.addr; msg.data = info.data; - kvm_irqchip_update_msi_route(kvm_state, i, msg, NULL); + accel_irqchip_update_msi_route(i, msg, NULL); } } - kvm_irqchip_commit_routes(kvm_state); + accel_irqchip_commit_routes(); } #endif } -#ifdef CONFIG_KVM +#ifdef ACCEL_KERNEL_GSI_IRQFD_POSSIBLE static void ioapic_iec_notifier(void *private, bool global, uint32_t index, uint32_t mask) { @@ -428,11 +434,11 @@ static const MemoryRegionOps ioapic_io_ops = { static void ioapic_machine_done_notify(Notifier *notifier, void *data) { -#ifdef CONFIG_KVM +#ifdef ACCEL_KERNEL_GSI_IRQFD_POSSIBLE IOAPICCommonState *s = container_of(notifier, IOAPICCommonState, machine_done); - if (kvm_irqchip_is_split()) { + if (accel_irqchip_is_split()) { X86IOMMUState *iommu = x86_iommu_get_default(); if (iommu) { /* Register this IOAPIC with IOMMU IEC notifier, so that diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 767216d795..0cdc16217f 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -34,6 +34,7 @@ #include "hw/pci/msi.h" #include "hw/pci/msix.h" #include "hw/loader.h" +#include "system/accel-irq.h" #include "system/kvm.h" #include "hw/virtio/virtio-pci.h" #include "qemu/range.h" @@ -825,11 +826,11 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, if (irqfd->users == 0) { KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); - ret = kvm_irqchip_add_msi_route(&c, vector, &proxy->pci_dev); + ret = accel_irqchip_add_msi_route(&c, vector, &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_route_changes(&c); + accel_irqchip_commit_route_changes(&c); irqfd->virq = ret; } irqfd->users++; @@ -841,7 +842,7 @@ static void kvm_virtio_pci_vq_vector_release(VirtIOPCIProxy *proxy, { VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; if (--irqfd->users == 0) { - kvm_irqchip_release_virq(kvm_state, irqfd->virq); + accel_irqchip_release_virq(irqfd->virq); } } @@ -850,7 +851,7 @@ static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy, unsigned int vector) { VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; - return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, irqfd->virq); + return accel_irqchip_add_irqfd_notifier_gsi(n, NULL, irqfd->virq); } static void kvm_virtio_pci_irqfd_release(VirtIOPCIProxy *proxy, @@ -860,7 +861,7 @@ static void kvm_virtio_pci_irqfd_release(VirtIOPCIProxy *proxy, VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; int ret; - ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, irqfd->virq); + ret = accel_irqchip_remove_irqfd_notifier_gsi(n, irqfd->virq); assert(ret == 0); } static int virtio_pci_get_notifier(VirtIOPCIProxy *proxy, int queue_no, @@ -995,12 +996,12 @@ static int virtio_pci_one_vector_unmask(VirtIOPCIProxy *proxy, if (proxy->vector_irqfd) { irqfd = &proxy->vector_irqfd[vector]; if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg, - &proxy->pci_dev); + ret = accel_irqchip_update_msi_route(irqfd->virq, msg, + &proxy->pci_dev); if (ret < 0) { return ret; } - kvm_irqchip_commit_routes(kvm_state); + accel_irqchip_commit_routes(); } } @@ -1229,7 +1230,7 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); int r, n; bool with_irqfd = msix_enabled(&proxy->pci_dev) && - kvm_msi_via_irqfd_enabled(); + accel_msi_via_irqfd_enabled() ; nvqs = MIN(nvqs, VIRTIO_QUEUE_MAX); @@ -1433,7 +1434,7 @@ static void virtio_pci_set_vector(VirtIODevice *vdev, uint16_t new_vector) { bool kvm_irqfd = (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) && - msix_enabled(&proxy->pci_dev) && kvm_msi_via_irqfd_enabled(); + msix_enabled(&proxy->pci_dev) && accel_msi_via_irqfd_enabled(); if (new_vector == old_vector) { return; diff --git a/include/system/accel-irq.h b/include/system/accel-irq.h new file mode 100644 index 0000000000..671fb7dfdb --- /dev/null +++ b/include/system/accel-irq.h @@ -0,0 +1,37 @@ +/* + * Accelerated irqchip abstraction + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou + * Magnus Kulke + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef SYSTEM_ACCEL_IRQ_H +#define SYSTEM_ACCEL_IRQ_H +#include "hw/pci/msi.h" +#include "qemu/osdep.h" +#include "system/kvm.h" +#include "system/mshv.h" + +static inline bool accel_msi_via_irqfd_enabled(void) +{ + return mshv_msi_via_irqfd_enabled() || kvm_msi_via_irqfd_enabled(); +} + +static inline bool accel_irqchip_is_split(void) +{ + return mshv_msi_via_irqfd_enabled() || kvm_irqchip_is_split(); +} + +int accel_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev); +int accel_irqchip_update_msi_route(int vector, MSIMessage msg, PCIDevice *dev); +void accel_irqchip_commit_route_changes(KVMRouteChange *c); +void accel_irqchip_commit_routes(void); +void accel_irqchip_release_virq(int virq); +int accel_irqchip_add_irqfd_notifier_gsi(EventNotifier *n, EventNotifier *rn, + int virq); +int accel_irqchip_remove_irqfd_notifier_gsi(EventNotifier *n, int virq); +#endif diff --git a/include/system/mshv.h b/include/system/mshv.h index 342f1ef6a9..2a504ed81f 100644 --- a/include/system/mshv.h +++ b/include/system/mshv.h @@ -22,4 +22,21 @@ #define CONFIG_MSHV_IS_POSSIBLE #endif +#ifdef CONFIG_MSHV_IS_POSSIBLE +extern bool mshv_allowed; +#define mshv_enabled() (mshv_allowed) +#else /* CONFIG_MSHV_IS_POSSIBLE */ +#define mshv_enabled() false +#endif +#define mshv_msi_via_irqfd_enabled() false + +/* interrupt */ +int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev); +int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev); +void mshv_irqchip_commit_routes(void); +void mshv_irqchip_release_virq(int virq); +int mshv_irqchip_add_irqfd_notifier_gsi(const EventNotifier *n, + const EventNotifier *rn, int virq); +int mshv_irqchip_remove_irqfd_notifier_gsi(const EventNotifier *n, int virq); + #endif From 7db6086287502ad5a015ccc31e85ec9cac7e2716 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:25 +0200 Subject: [PATCH 11/35] include/hw/hyperv: Add MSHV ABI header definitions Introduce headers for the Microsoft Hypervisor (MSHV) userspace ABI, including IOCTLs and structures used to interface with the hypervisor. These definitions are based on the upstream Linux MSHV interface and will be used by the MSHV accelerator backend in later patches. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-6-magnuskulke@linux.microsoft.com [Do not use __uN types. - Paolo] Signed-off-by: Paolo Bonzini --- include/hw/hyperv/hvgdk.h | 20 + include/hw/hyperv/hvgdk_mini.h | 817 ++++++++++++++++++++++++++++++++ include/hw/hyperv/hvhdk.h | 249 ++++++++++ include/hw/hyperv/hvhdk_mini.h | 102 ++++ scripts/update-linux-headers.sh | 2 +- 5 files changed, 1189 insertions(+), 1 deletion(-) create mode 100644 include/hw/hyperv/hvgdk.h create mode 100644 include/hw/hyperv/hvgdk_mini.h create mode 100644 include/hw/hyperv/hvhdk.h create mode 100644 include/hw/hyperv/hvhdk_mini.h diff --git a/include/hw/hyperv/hvgdk.h b/include/hw/hyperv/hvgdk.h new file mode 100644 index 0000000000..71161f477c --- /dev/null +++ b/include/hw/hyperv/hvgdk.h @@ -0,0 +1,20 @@ +/* + * Type definitions for the mshv guest interface. + * + * Copyright Microsoft, Corp. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_HYPERV_HVGDK_H +#define HW_HYPERV_HVGDK_H + +#define HVGDK_H_VERSION (25125) + +enum hv_unimplemented_msr_action { + HV_UNIMPLEMENTED_MSR_ACTION_FAULT = 0, + HV_UNIMPLEMENTED_MSR_ACTION_IGNORE_WRITE_READ_ZERO = 1, + HV_UNIMPLEMENTED_MSR_ACTION_COUNT = 2, +}; + +#endif /* HW_HYPERV_HVGDK_H */ diff --git a/include/hw/hyperv/hvgdk_mini.h b/include/hw/hyperv/hvgdk_mini.h new file mode 100644 index 0000000000..d89315f545 --- /dev/null +++ b/include/hw/hyperv/hvgdk_mini.h @@ -0,0 +1,817 @@ +/* + * Userspace interfaces for /dev/mshv* devices and derived fds + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_HYPERV_HVGDK_MINI_H +#define HW_HYPERV_HVGDK_MINI_H + +#define MSHV_IOCTL 0xB8 + +typedef enum hv_register_name { + /* Pending Interruption Register */ + HV_REGISTER_PENDING_INTERRUPTION = 0x00010002, + + /* X64 User-Mode Registers */ + HV_X64_REGISTER_RAX = 0x00020000, + HV_X64_REGISTER_RCX = 0x00020001, + HV_X64_REGISTER_RDX = 0x00020002, + HV_X64_REGISTER_RBX = 0x00020003, + HV_X64_REGISTER_RSP = 0x00020004, + HV_X64_REGISTER_RBP = 0x00020005, + HV_X64_REGISTER_RSI = 0x00020006, + HV_X64_REGISTER_RDI = 0x00020007, + HV_X64_REGISTER_R8 = 0x00020008, + HV_X64_REGISTER_R9 = 0x00020009, + HV_X64_REGISTER_R10 = 0x0002000A, + HV_X64_REGISTER_R11 = 0x0002000B, + HV_X64_REGISTER_R12 = 0x0002000C, + HV_X64_REGISTER_R13 = 0x0002000D, + HV_X64_REGISTER_R14 = 0x0002000E, + HV_X64_REGISTER_R15 = 0x0002000F, + HV_X64_REGISTER_RIP = 0x00020010, + HV_X64_REGISTER_RFLAGS = 0x00020011, + + /* X64 Floating Point and Vector Registers */ + HV_X64_REGISTER_XMM0 = 0x00030000, + HV_X64_REGISTER_XMM1 = 0x00030001, + HV_X64_REGISTER_XMM2 = 0x00030002, + HV_X64_REGISTER_XMM3 = 0x00030003, + HV_X64_REGISTER_XMM4 = 0x00030004, + HV_X64_REGISTER_XMM5 = 0x00030005, + HV_X64_REGISTER_XMM6 = 0x00030006, + HV_X64_REGISTER_XMM7 = 0x00030007, + HV_X64_REGISTER_XMM8 = 0x00030008, + HV_X64_REGISTER_XMM9 = 0x00030009, + HV_X64_REGISTER_XMM10 = 0x0003000A, + HV_X64_REGISTER_XMM11 = 0x0003000B, + HV_X64_REGISTER_XMM12 = 0x0003000C, + HV_X64_REGISTER_XMM13 = 0x0003000D, + HV_X64_REGISTER_XMM14 = 0x0003000E, + HV_X64_REGISTER_XMM15 = 0x0003000F, + HV_X64_REGISTER_FP_MMX0 = 0x00030010, + HV_X64_REGISTER_FP_MMX1 = 0x00030011, + HV_X64_REGISTER_FP_MMX2 = 0x00030012, + HV_X64_REGISTER_FP_MMX3 = 0x00030013, + HV_X64_REGISTER_FP_MMX4 = 0x00030014, + HV_X64_REGISTER_FP_MMX5 = 0x00030015, + HV_X64_REGISTER_FP_MMX6 = 0x00030016, + HV_X64_REGISTER_FP_MMX7 = 0x00030017, + HV_X64_REGISTER_FP_CONTROL_STATUS = 0x00030018, + HV_X64_REGISTER_XMM_CONTROL_STATUS = 0x00030019, + + /* X64 Control Registers */ + HV_X64_REGISTER_CR0 = 0x00040000, + HV_X64_REGISTER_CR2 = 0x00040001, + HV_X64_REGISTER_CR3 = 0x00040002, + HV_X64_REGISTER_CR4 = 0x00040003, + HV_X64_REGISTER_CR8 = 0x00040004, + HV_X64_REGISTER_XFEM = 0x00040005, + + /* X64 Segment Registers */ + HV_X64_REGISTER_ES = 0x00060000, + HV_X64_REGISTER_CS = 0x00060001, + HV_X64_REGISTER_SS = 0x00060002, + HV_X64_REGISTER_DS = 0x00060003, + HV_X64_REGISTER_FS = 0x00060004, + HV_X64_REGISTER_GS = 0x00060005, + HV_X64_REGISTER_LDTR = 0x00060006, + HV_X64_REGISTER_TR = 0x00060007, + + /* X64 Table Registers */ + HV_X64_REGISTER_IDTR = 0x00070000, + HV_X64_REGISTER_GDTR = 0x00070001, + + /* X64 Virtualized MSRs */ + HV_X64_REGISTER_TSC = 0x00080000, + HV_X64_REGISTER_EFER = 0x00080001, + HV_X64_REGISTER_KERNEL_GS_BASE = 0x00080002, + HV_X64_REGISTER_APIC_BASE = 0x00080003, + HV_X64_REGISTER_PAT = 0x00080004, + HV_X64_REGISTER_SYSENTER_CS = 0x00080005, + HV_X64_REGISTER_SYSENTER_EIP = 0x00080006, + HV_X64_REGISTER_SYSENTER_ESP = 0x00080007, + HV_X64_REGISTER_STAR = 0x00080008, + HV_X64_REGISTER_LSTAR = 0x00080009, + HV_X64_REGISTER_CSTAR = 0x0008000A, + HV_X64_REGISTER_SFMASK = 0x0008000B, + HV_X64_REGISTER_INITIAL_APIC_ID = 0x0008000C, + + /* X64 Cache control MSRs */ + HV_X64_REGISTER_MSR_MTRR_CAP = 0x0008000D, + HV_X64_REGISTER_MSR_MTRR_DEF_TYPE = 0x0008000E, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0 = 0x00080010, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1 = 0x00080011, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2 = 0x00080012, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3 = 0x00080013, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4 = 0x00080014, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5 = 0x00080015, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6 = 0x00080016, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7 = 0x00080017, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8 = 0x00080018, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9 = 0x00080019, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA = 0x0008001A, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB = 0x0008001B, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC = 0x0008001C, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASED = 0x0008001D, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE = 0x0008001E, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF = 0x0008001F, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0 = 0x00080040, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1 = 0x00080041, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2 = 0x00080042, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3 = 0x00080043, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4 = 0x00080044, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5 = 0x00080045, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6 = 0x00080046, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7 = 0x00080047, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8 = 0x00080048, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9 = 0x00080049, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA = 0x0008004A, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB = 0x0008004B, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC = 0x0008004C, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD = 0x0008004D, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE = 0x0008004E, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF = 0x0008004F, + HV_X64_REGISTER_MSR_MTRR_FIX64K00000 = 0x00080070, + HV_X64_REGISTER_MSR_MTRR_FIX16K80000 = 0x00080071, + HV_X64_REGISTER_MSR_MTRR_FIX16KA0000 = 0x00080072, + HV_X64_REGISTER_MSR_MTRR_FIX4KC0000 = 0x00080073, + HV_X64_REGISTER_MSR_MTRR_FIX4KC8000 = 0x00080074, + HV_X64_REGISTER_MSR_MTRR_FIX4KD0000 = 0x00080075, + HV_X64_REGISTER_MSR_MTRR_FIX4KD8000 = 0x00080076, + HV_X64_REGISTER_MSR_MTRR_FIX4KE0000 = 0x00080077, + HV_X64_REGISTER_MSR_MTRR_FIX4KE8000 = 0x00080078, + HV_X64_REGISTER_MSR_MTRR_FIX4KF0000 = 0x00080079, + HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A, + + HV_X64_REGISTER_TSC_AUX = 0x0008007B, + HV_X64_REGISTER_BNDCFGS = 0x0008007C, + HV_X64_REGISTER_DEBUG_CTL = 0x0008007D, + + /* Available */ + + HV_X64_REGISTER_SPEC_CTRL = 0x00080084, + HV_X64_REGISTER_TSC_ADJUST = 0x00080096, + + /* Other MSRs */ + HV_X64_REGISTER_MSR_IA32_MISC_ENABLE = 0x000800A0, + + /* Misc */ + HV_REGISTER_GUEST_OS_ID = 0x00090002, + HV_REGISTER_REFERENCE_TSC = 0x00090017, + + /* Hypervisor-defined Registers (Synic) */ + HV_REGISTER_SINT0 = 0x000A0000, + HV_REGISTER_SINT1 = 0x000A0001, + HV_REGISTER_SINT2 = 0x000A0002, + HV_REGISTER_SINT3 = 0x000A0003, + HV_REGISTER_SINT4 = 0x000A0004, + HV_REGISTER_SINT5 = 0x000A0005, + HV_REGISTER_SINT6 = 0x000A0006, + HV_REGISTER_SINT7 = 0x000A0007, + HV_REGISTER_SINT8 = 0x000A0008, + HV_REGISTER_SINT9 = 0x000A0009, + HV_REGISTER_SINT10 = 0x000A000A, + HV_REGISTER_SINT11 = 0x000A000B, + HV_REGISTER_SINT12 = 0x000A000C, + HV_REGISTER_SINT13 = 0x000A000D, + HV_REGISTER_SINT14 = 0x000A000E, + HV_REGISTER_SINT15 = 0x000A000F, + HV_REGISTER_SCONTROL = 0x000A0010, + HV_REGISTER_SVERSION = 0x000A0011, + HV_REGISTER_SIEFP = 0x000A0012, + HV_REGISTER_SIMP = 0x000A0013, + HV_REGISTER_EOM = 0x000A0014, + HV_REGISTER_SIRBP = 0x000A0015, +} hv_register_name; + +enum hv_intercept_type { + HV_INTERCEPT_TYPE_X64_IO_PORT = 0X00000000, + HV_INTERCEPT_TYPE_X64_MSR = 0X00000001, + HV_INTERCEPT_TYPE_X64_CPUID = 0X00000002, + HV_INTERCEPT_TYPE_EXCEPTION = 0X00000003, + + /* Used to be HV_INTERCEPT_TYPE_REGISTER */ + HV_INTERCEPT_TYPE_RESERVED0 = 0X00000004, + HV_INTERCEPT_TYPE_MMIO = 0X00000005, + HV_INTERCEPT_TYPE_X64_GLOBAL_CPUID = 0X00000006, + HV_INTERCEPT_TYPE_X64_APIC_SMI = 0X00000007, + HV_INTERCEPT_TYPE_HYPERCALL = 0X00000008, + + HV_INTERCEPT_TYPE_X64_APIC_INIT_SIPI = 0X00000009, + HV_INTERCEPT_MC_UPDATE_PATCH_LEVEL_MSR_READ = 0X0000000A, + + HV_INTERCEPT_TYPE_X64_APIC_WRITE = 0X0000000B, + HV_INTERCEPT_TYPE_X64_MSR_INDEX = 0X0000000C, + HV_INTERCEPT_TYPE_MAX, + HV_INTERCEPT_TYPE_INVALID = 0XFFFFFFFF, +}; + +struct hv_u128 { + uint64_t low_part; + uint64_t high_part; +}; + +union hv_x64_xmm_control_status_register { + struct hv_u128 as_uint128; + struct { + union { + /* long mode */ + uint64_t last_fp_rdp; + /* 32 bit mode */ + struct { + uint32_t last_fp_dp; + uint16_t last_fp_ds; + uint16_t padding; + }; + }; + uint32_t xmm_status_control; + uint32_t xmm_status_control_mask; + }; +}; + +union hv_x64_fp_register { + struct hv_u128 as_uint128; + struct { + uint64_t mantissa; + uint64_t biased_exponent:15; + uint64_t sign:1; + uint64_t reserved:48; + }; +}; + +union hv_x64_pending_exception_event { + uint64_t as_uint64[2]; + struct { + uint32_t event_pending:1; + uint32_t event_type:3; + uint32_t reserved0:4; + uint32_t deliver_error_code:1; + uint32_t reserved1:7; + uint32_t vector:16; + uint32_t error_code; + uint64_t exception_parameter; + }; +}; + +union hv_x64_pending_virtualization_fault_event { + uint64_t as_uint64[2]; + struct { + uint32_t event_pending:1; + uint32_t event_type:3; + uint32_t reserved0:4; + uint32_t reserved1:8; + uint32_t parameter0:16; + uint32_t code; + uint64_t parameter1; + }; +}; + +union hv_x64_pending_interruption_register { + uint64_t as_uint64; + struct { + uint32_t interruption_pending:1; + uint32_t interruption_type:3; + uint32_t deliver_error_code:1; + uint32_t instruction_length:4; + uint32_t nested_event:1; + uint32_t reserved:6; + uint32_t interruption_vector:16; + uint32_t error_code; + }; +}; + +union hv_x64_register_sev_control { + uint64_t as_uint64; + struct { + uint64_t enable_encrypted_state:1; + uint64_t reserved_z:11; + uint64_t vmsa_gpa_page_number:52; + }; +}; + +union hv_x64_msr_npiep_config_contents { + uint64_t as_uint64; + struct { + /* + * These bits enable instruction execution prevention for + * specific instructions. + */ + uint64_t prevents_gdt:1; + uint64_t prevents_idt:1; + uint64_t prevents_ldt:1; + uint64_t prevents_tr:1; + + /* The reserved bits must always be 0. */ + uint64_t reserved:60; + }; +}; + +typedef struct hv_x64_segment_register { + uint64_t base; + uint32_t limit; + uint16_t selector; + union { + struct { + uint16_t segment_type:4; + uint16_t non_system_segment:1; + uint16_t descriptor_privilege_level:2; + uint16_t present:1; + uint16_t reserved:4; + uint16_t available:1; + uint16_t _long:1; + uint16_t _default:1; + uint16_t granularity:1; + }; + uint16_t attributes; + }; +} hv_x64_segment_register; + +typedef struct hv_x64_table_register { + uint16_t pad[3]; + uint16_t limit; + uint64_t base; +} hv_x64_table_register; + +union hv_x64_fp_control_status_register { + struct hv_u128 as_uint128; + struct { + uint16_t fp_control; + uint16_t fp_status; + uint8_t fp_tag; + uint8_t reserved; + uint16_t last_fp_op; + union { + /* long mode */ + uint64_t last_fp_rip; + /* 32 bit mode */ + struct { + uint32_t last_fp_eip; + uint16_t last_fp_cs; + uint16_t padding; + }; + }; + }; +}; + +/* General Hypervisor Register Content Definitions */ + +union hv_explicit_suspend_register { + uint64_t as_uint64; + struct { + uint64_t suspended:1; + uint64_t reserved:63; + }; +}; + +union hv_internal_activity_register { + uint64_t as_uint64; + + struct { + uint64_t startup_suspend:1; + uint64_t halt_suspend:1; + uint64_t idle_suspend:1; + uint64_t rsvd_z:61; + }; +}; + +union hv_x64_interrupt_state_register { + uint64_t as_uint64; + struct { + uint64_t interrupt_shadow:1; + uint64_t nmi_masked:1; + uint64_t reserved:62; + }; +}; + +union hv_intercept_suspend_register { + uint64_t as_uint64; + struct { + uint64_t suspended:1; + uint64_t reserved:63; + }; +}; + +typedef union hv_register_value { + struct hv_u128 reg128; + uint64_t reg64; + uint32_t reg32; + uint16_t reg16; + uint8_t reg8; + union hv_x64_fp_register fp; + union hv_x64_fp_control_status_register fp_control_status; + union hv_x64_xmm_control_status_register xmm_control_status; + struct hv_x64_segment_register segment; + struct hv_x64_table_register table; + union hv_explicit_suspend_register explicit_suspend; + union hv_intercept_suspend_register intercept_suspend; + union hv_internal_activity_register internal_activity; + union hv_x64_interrupt_state_register interrupt_state; + union hv_x64_pending_interruption_register pending_interruption; + union hv_x64_msr_npiep_config_contents npiep_config; + union hv_x64_pending_exception_event pending_exception_event; + union hv_x64_pending_virtualization_fault_event + pending_virtualization_fault_event; + union hv_x64_register_sev_control sev_control; +} hv_register_value; + +typedef struct hv_register_assoc { + uint32_t name; /* enum hv_register_name */ + uint32_t reserved1; + uint64_t reserved2; + union hv_register_value value; +} hv_register_assoc; + +union hv_input_vtl { + uint8_t as_uint8; + struct { + uint8_t target_vtl:4; + uint8_t use_target_vtl:1; + uint8_t reserved_z:3; + }; +}; + +typedef struct hv_input_get_vp_registers { + uint64_t partition_id; + uint32_t vp_index; + union hv_input_vtl input_vtl; + uint8_t rsvd_z8; + uint16_t rsvd_z16; + uint32_t names[]; +} hv_input_get_vp_registers; + +typedef struct hv_input_set_vp_registers { + uint64_t partition_id; + uint32_t vp_index; + union hv_input_vtl input_vtl; + uint8_t rsvd_z8; + uint16_t rsvd_z16; + struct hv_register_assoc elements[]; +} hv_input_set_vp_registers; + +#define MSHV_VP_MAX_REGISTERS 128 + +struct mshv_vp_registers { + int count; /* at most MSHV_VP_MAX_REGISTERS */ + struct hv_register_assoc *regs; +}; + +union hv_interrupt_control { + uint64_t as_uint64; + struct { + uint32_t interrupt_type; /* enum hv_interrupt type */ + uint32_t level_triggered:1; + uint32_t logical_dest_mode:1; + uint32_t rsvd:30; + }; +}; + +struct hv_input_assert_virtual_interrupt { + uint64_t partition_id; + union hv_interrupt_control control; + uint64_t dest_addr; /* cpu's apic id */ + uint32_t vector; + uint8_t target_vtl; + uint8_t rsvd_z0; + uint16_t rsvd_z1; +}; + +/* /dev/mshv */ +#define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition) +#define MSHV_CREATE_VP _IOW(MSHV_IOCTL, 0x01, struct mshv_create_vp) + +/* Partition fds created with MSHV_CREATE_PARTITION */ +#define MSHV_INITIALIZE_PARTITION _IO(MSHV_IOCTL, 0x00) +#define MSHV_SET_GUEST_MEMORY _IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region) +#define MSHV_IRQFD _IOW(MSHV_IOCTL, 0x03, struct mshv_user_irqfd) +#define MSHV_IOEVENTFD _IOW(MSHV_IOCTL, 0x04, struct mshv_user_ioeventfd) +#define MSHV_SET_MSI_ROUTING _IOW(MSHV_IOCTL, 0x05, struct mshv_user_irq_table) + +/* + ******************************** + * VP APIs for child partitions * + ******************************** + */ + +struct hv_local_interrupt_controller_state { + /* HV_X64_INTERRUPT_CONTROLLER_STATE */ + uint32_t apic_id; + uint32_t apic_version; + uint32_t apic_ldr; + uint32_t apic_dfr; + uint32_t apic_spurious; + uint32_t apic_isr[8]; + uint32_t apic_tmr[8]; + uint32_t apic_irr[8]; + uint32_t apic_esr; + uint32_t apic_icr_high; + uint32_t apic_icr_low; + uint32_t apic_lvt_timer; + uint32_t apic_lvt_thermal; + uint32_t apic_lvt_perfmon; + uint32_t apic_lvt_lint0; + uint32_t apic_lvt_lint1; + uint32_t apic_lvt_error; + uint32_t apic_lvt_cmci; + uint32_t apic_error_status; + uint32_t apic_initial_count; + uint32_t apic_counter_value; + uint32_t apic_divide_configuration; + uint32_t apic_remote_read; +}; + +/* Generic hypercall */ +#define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) + +/* From hvgdk_mini.h */ + +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_EOM 0x40000084 + +/* Define port identifier type. */ +union hv_port_id { + uint32_t asuint32_t; + struct { + uint32_t id:24; + uint32_t reserved:8; + }; +}; + +#define HV_MESSAGE_SIZE (256) +#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) +#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) + +/* Define hypervisor message types. */ +enum hv_message_type { + HVMSG_NONE = 0x00000000, + + /* Memory access messages. */ + HVMSG_UNMAPPED_GPA = 0x80000000, + HVMSG_GPA_INTERCEPT = 0x80000001, + HVMSG_UNACCEPTED_GPA = 0x80000003, + HVMSG_GPA_ATTRIBUTE_INTERCEPT = 0x80000004, + + /* Timer notification messages. */ + HVMSG_TIMER_EXPIRED = 0x80000010, + + /* Error messages. */ + HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, + HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, + HVMSG_UNSUPPORTED_FEATURE = 0x80000022, + + /* + * Opaque intercept message. The original intercept message is only + * accessible from the mapped intercept message page. + */ + HVMSG_OPAQUE_INTERCEPT = 0x8000003F, + + /* Trace buffer complete messages. */ + HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, + + /* Hypercall intercept */ + HVMSG_HYPERCALL_INTERCEPT = 0x80000050, + + /* SynIC intercepts */ + HVMSG_SYNIC_EVENT_INTERCEPT = 0x80000060, + HVMSG_SYNIC_SINT_INTERCEPT = 0x80000061, + HVMSG_SYNIC_SINT_DELIVERABLE = 0x80000062, + + /* Async call completion intercept */ + HVMSG_ASYNC_CALL_COMPLETION = 0x80000070, + + /* Root scheduler messages */ + HVMSG_SCHEDULER_VP_SIGNAL_BITSE = 0x80000100, + HVMSG_SCHEDULER_VP_SIGNAL_PAIR = 0x80000101, + + /* Platform-specific processor intercept messages. */ + HVMSG_X64_IO_PORT_INTERCEPT = 0x80010000, + HVMSG_X64_MSR_INTERCEPT = 0x80010001, + HVMSG_X64_CPUID_INTERCEPT = 0x80010002, + HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, + HVMSG_X64_APIC_EOI = 0x80010004, + HVMSG_X64_LEGACY_FP_ERROR = 0x80010005, + HVMSG_X64_IOMMU_PRQ = 0x80010006, + HVMSG_X64_HALT = 0x80010007, + HVMSG_X64_INTERRUPTION_DELIVERABLE = 0x80010008, + HVMSG_X64_SIPI_INTERCEPT = 0x80010009, + HVMSG_X64_SEV_VMGEXIT_INTERCEPT = 0x80010013, +}; + +union hv_x64_vp_execution_state { + uint16_t as_uint16; + struct { + uint16_t cpl:2; + uint16_t cr0_pe:1; + uint16_t cr0_am:1; + uint16_t efer_lma:1; + uint16_t debug_active:1; + uint16_t interruption_pending:1; + uint16_t vtl:4; + uint16_t enclave_mode:1; + uint16_t interrupt_shadow:1; + uint16_t virtualization_fault_active:1; + uint16_t reserved:2; + }; +}; + +/* From openvmm::hvdef */ +enum hv_x64_intercept_access_type { + HV_X64_INTERCEPT_ACCESS_TYPE_READ = 0, + HV_X64_INTERCEPT_ACCESS_TYPE_WRITE = 1, + HV_X64_INTERCEPT_ACCESS_TYPE_EXECUTE = 2, +}; + +struct hv_x64_intercept_message_header { + uint32_t vp_index; + uint8_t instruction_length:4; + uint8_t cr8:4; /* Only set for exo partitions */ + uint8_t intercept_access_type; + union hv_x64_vp_execution_state execution_state; + struct hv_x64_segment_register cs_segment; + uint64_t rip; + uint64_t rflags; +}; + +union hv_x64_io_port_access_info { + uint8_t as_uint8; + struct { + uint8_t access_size:3; + uint8_t string_op:1; + uint8_t rep_prefix:1; + uint8_t reserved:3; + }; +}; + +typedef struct hv_x64_io_port_intercept_message { + struct hv_x64_intercept_message_header header; + uint16_t port_number; + union hv_x64_io_port_access_info access_info; + uint8_t instruction_byte_count; + uint32_t reserved; + uint64_t rax; + uint8_t instruction_bytes[16]; + struct hv_x64_segment_register ds_segment; + struct hv_x64_segment_register es_segment; + uint64_t rcx; + uint64_t rsi; + uint64_t rdi; +} hv_x64_io_port_intercept_message; + +union hv_x64_memory_access_info { + uint8_t as_uint8; + struct { + uint8_t gva_valid:1; + uint8_t gva_gpa_valid:1; + uint8_t hypercall_output_pending:1; + uint8_t tlb_locked_no_overlay:1; + uint8_t reserved:4; + }; +}; + +struct hv_x64_memory_intercept_message { + struct hv_x64_intercept_message_header header; + uint32_t cache_type; /* enum hv_cache_type */ + uint8_t instruction_byte_count; + union hv_x64_memory_access_info memory_access_info; + uint8_t tpr_priority; + uint8_t reserved1; + uint64_t guest_virtual_address; + uint64_t guest_physical_address; + uint8_t instruction_bytes[16]; +}; + +union hv_message_flags { + uint8_t asu8; + struct { + uint8_t msg_pending:1; + uint8_t reserved:7; + }; +}; + +struct hv_message_header { + uint32_t message_type; + uint8_t payload_size; + union hv_message_flags message_flags; + uint8_t reserved[2]; + union { + uint64_t sender; + union hv_port_id port; + }; +}; + +struct hv_message { + struct hv_message_header header; + union { + uint64_t payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; + } u; +}; + +/* From github.com/rust-vmm/mshv-bindings/src/x86_64/regs.rs */ + +struct hv_cpuid_entry { + uint32_t function; + uint32_t index; + uint32_t flags; + uint32_t eax; + uint32_t ebx; + uint32_t ecx; + uint32_t edx; + uint32_t padding[3]; +}; + +struct hv_cpuid { + uint32_t nent; + uint32_t padding; + struct hv_cpuid_entry entries[0]; +}; + +#define IA32_MSR_TSC 0x00000010 +#define IA32_MSR_EFER 0xC0000080 +#define IA32_MSR_KERNEL_GS_BASE 0xC0000102 +#define IA32_MSR_APIC_BASE 0x0000001B +#define IA32_MSR_PAT 0x0277 +#define IA32_MSR_SYSENTER_CS 0x00000174 +#define IA32_MSR_SYSENTER_ESP 0x00000175 +#define IA32_MSR_SYSENTER_EIP 0x00000176 +#define IA32_MSR_STAR 0xC0000081 +#define IA32_MSR_LSTAR 0xC0000082 +#define IA32_MSR_CSTAR 0xC0000083 +#define IA32_MSR_SFMASK 0xC0000084 + +#define IA32_MSR_MTRR_CAP 0x00FE +#define IA32_MSR_MTRR_DEF_TYPE 0x02FF +#define IA32_MSR_MTRR_PHYSBASE0 0x0200 +#define IA32_MSR_MTRR_PHYSMASK0 0x0201 +#define IA32_MSR_MTRR_PHYSBASE1 0x0202 +#define IA32_MSR_MTRR_PHYSMASK1 0x0203 +#define IA32_MSR_MTRR_PHYSBASE2 0x0204 +#define IA32_MSR_MTRR_PHYSMASK2 0x0205 +#define IA32_MSR_MTRR_PHYSBASE3 0x0206 +#define IA32_MSR_MTRR_PHYSMASK3 0x0207 +#define IA32_MSR_MTRR_PHYSBASE4 0x0208 +#define IA32_MSR_MTRR_PHYSMASK4 0x0209 +#define IA32_MSR_MTRR_PHYSBASE5 0x020A +#define IA32_MSR_MTRR_PHYSMASK5 0x020B +#define IA32_MSR_MTRR_PHYSBASE6 0x020C +#define IA32_MSR_MTRR_PHYSMASK6 0x020D +#define IA32_MSR_MTRR_PHYSBASE7 0x020E +#define IA32_MSR_MTRR_PHYSMASK7 0x020F + +#define IA32_MSR_MTRR_FIX64K_00000 0x0250 +#define IA32_MSR_MTRR_FIX16K_80000 0x0258 +#define IA32_MSR_MTRR_FIX16K_A0000 0x0259 +#define IA32_MSR_MTRR_FIX4K_C0000 0x0268 +#define IA32_MSR_MTRR_FIX4K_C8000 0x0269 +#define IA32_MSR_MTRR_FIX4K_D0000 0x026A +#define IA32_MSR_MTRR_FIX4K_D8000 0x026B +#define IA32_MSR_MTRR_FIX4K_E0000 0x026C +#define IA32_MSR_MTRR_FIX4K_E8000 0x026D +#define IA32_MSR_MTRR_FIX4K_F0000 0x026E +#define IA32_MSR_MTRR_FIX4K_F8000 0x026F + +#define IA32_MSR_TSC_AUX 0xC0000103 +#define IA32_MSR_BNDCFGS 0x00000d90 +#define IA32_MSR_DEBUG_CTL 0x1D9 +#define IA32_MSR_SPEC_CTRL 0x00000048 +#define IA32_MSR_TSC_ADJUST 0x0000003b + +#define IA32_MSR_MISC_ENABLE 0x000001a0 + +#define HV_TRANSLATE_GVA_VALIDATE_READ (0x0001) +#define HV_TRANSLATE_GVA_VALIDATE_WRITE (0x0002) +#define HV_TRANSLATE_GVA_VALIDATE_EXECUTE (0x0004) + +#define HV_HYP_PAGE_SHIFT 12 +#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) +#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) + +#define HVCALL_GET_PARTITION_PROPERTY 0x0044 +#define HVCALL_SET_PARTITION_PROPERTY 0x0045 +#define HVCALL_GET_VP_REGISTERS 0x0050 +#define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_TRANSLATE_VIRTUAL_ADDRESS 0x0052 +#define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 +#define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 + +#endif /* HW_HYPERV_HVGDK_MINI_H */ diff --git a/include/hw/hyperv/hvhdk.h b/include/hw/hyperv/hvhdk.h new file mode 100644 index 0000000000..866c8211bf --- /dev/null +++ b/include/hw/hyperv/hvhdk.h @@ -0,0 +1,249 @@ +/* + * Type definitions for the mshv host. + * + * Copyright Microsoft, Corp. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_HYPERV_HVHDK_H +#define HW_HYPERV_HVHDK_H + +#define HV_PARTITION_SYNTHETIC_PROCESSOR_FEATURES_BANKS 1 + +struct hv_input_set_partition_property { + uint64_t partition_id; + uint32_t property_code; /* enum hv_partition_property_code */ + uint32_t padding; + uint64_t property_value; +}; + +union hv_partition_synthetic_processor_features { + uint64_t as_uint64[HV_PARTITION_SYNTHETIC_PROCESSOR_FEATURES_BANKS]; + + struct { + /* + * Report a hypervisor is present. CPUID leaves + * 0x40000000 and 0x40000001 are supported. + */ + uint64_t hypervisor_present:1; + + /* + * Features associated with HV#1: + */ + + /* Report support for Hv1 (CPUID leaves 0x40000000 - 0x40000006). */ + uint64_t hv1:1; + + /* + * Access to HV_X64_MSR_VP_RUNTIME. + * Corresponds to access_vp_run_time_reg privilege. + */ + uint64_t access_vp_run_time_reg:1; + + /* + * Access to HV_X64_MSR_TIME_REF_COUNT. + * Corresponds to access_partition_reference_counter privilege. + */ + uint64_t access_partition_reference_counter:1; + + /* + * Access to SINT-related registers (HV_X64_MSR_SCONTROL through + * HV_X64_MSR_EOM and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15). + * Corresponds to access_synic_regs privilege. + */ + uint64_t access_synic_regs:1; + + /* + * Access to synthetic timers and associated MSRs + * (HV_X64_MSR_STIMER0_CONFIG through HV_X64_MSR_STIMER3_COUNT). + * Corresponds to access_synthetic_timer_regs privilege. + */ + uint64_t access_synthetic_timer_regs:1; + + /* + * Access to APIC MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and + * HV_X64_MSR_TPR) as well as the VP assist page. + * Corresponds to access_intr_ctrl_regs privilege. + */ + uint64_t access_intr_ctrl_regs:1; + + /* + * Access to registers associated with hypercalls + * (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL). + * Corresponds to access_hypercall_msrs privilege. + */ + uint64_t access_hypercall_regs:1; + + /* VP index can be queried. corresponds to access_vp_index privilege. */ + uint64_t access_vp_index:1; + + /* + * Access to the reference TSC. Corresponds to + * access_partition_reference_tsc privilege. + */ + uint64_t access_partition_reference_tsc:1; + + /* + * Partition has access to the guest idle reg. Corresponds to + * access_guest_idle_reg privilege. + */ + uint64_t access_guest_idle_reg:1; + + /* + * Partition has access to frequency regs. corresponds to + * access_frequency_regs privilege. + */ + uint64_t access_frequency_regs:1; + + uint64_t reserved_z12:1; /* Reserved for access_reenlightenment_controls */ + uint64_t reserved_z13:1; /* Reserved for access_root_scheduler_reg */ + uint64_t reserved_z14:1; /* Reserved for access_tsc_invariant_controls */ + + /* + * Extended GVA ranges for HvCallFlushVirtualAddressList hypercall. + * Corresponds to privilege. + */ + uint64_t enable_extended_gva_ranges_for_flush_virtual_address_list:1; + + uint64_t reserved_z16:1; /* Reserved for access_vsm. */ + uint64_t reserved_z17:1; /* Reserved for access_vp_registers. */ + + /* Use fast hypercall output. Corresponds to privilege. */ + uint64_t fast_hypercall_output:1; + + uint64_t reserved_z19:1; /* Reserved for enable_extended_hypercalls. */ + + /* + * HvStartVirtualProcessor can be used to start virtual processors. + * Corresponds to privilege. + */ + uint64_t start_virtual_processor:1; + + uint64_t reserved_z21:1; /* Reserved for Isolation. */ + + /* Synthetic timers in direct mode. */ + uint64_t direct_synthetic_timers:1; + + uint64_t reserved_z23:1; /* Reserved for synthetic time unhalted timer */ + + /* Use extended processor masks. */ + uint64_t extended_processor_masks:1; + + /* + * HvCallFlushVirtualAddressSpace / HvCallFlushVirtualAddressList are + * supported. + */ + uint64_t tb_flush_hypercalls:1; + + /* HvCallSendSyntheticClusterIpi is supported. */ + uint64_t synthetic_cluster_ipi:1; + + /* HvCallNotifyLongSpinWait is supported. */ + uint64_t notify_long_spin_wait:1; + + /* HvCallQueryNumaDistance is supported. */ + uint64_t query_numa_distance:1; + + /* HvCallSignalEvent is supported. Corresponds to privilege. */ + uint64_t signal_events:1; + + /* HvCallRetargetDeviceInterrupt is supported. */ + uint64_t retarget_device_interrupt:1; + + /* HvCallRestorePartitionTime is supported. */ + uint64_t restore_time:1; + + /* EnlightenedVmcs nested enlightenment is supported. */ + uint64_t enlightened_vmcs:1; + + uint64_t reserved:30; + }; +}; + +enum hv_translate_gva_result_code { + HV_TRANSLATE_GVA_SUCCESS = 0, + + /* Translation failures. */ + HV_TRANSLATE_GVA_PAGE_NOT_PRESENT = 1, + HV_TRANSLATE_GVA_PRIVILEGE_VIOLATION = 2, + HV_TRANSLATE_GVA_INVALIDE_PAGE_TABLE_FLAGS = 3, + + /* GPA access failures. */ + HV_TRANSLATE_GVA_GPA_UNMAPPED = 4, + HV_TRANSLATE_GVA_GPA_NO_READ_ACCESS = 5, + HV_TRANSLATE_GVA_GPA_NO_WRITE_ACCESS = 6, + HV_TRANSLATE_GVA_GPA_ILLEGAL_OVERLAY_ACCESS = 7, + + /* + * Intercept for memory access by either + * - a higher VTL + * - a nested hypervisor (due to a violation of the nested page table) + */ + HV_TRANSLATE_GVA_INTERCEPT = 8, + + HV_TRANSLATE_GVA_GPA_UNACCEPTED = 9, +}; + +union hv_translate_gva_result { + uint64_t as_uint64; + struct { + uint32_t result_code; /* enum hv_translate_hva_result_code */ + uint32_t cache_type:8; + uint32_t overlay_page:1; + uint32_t reserved:23; + }; +}; + +typedef struct hv_input_translate_virtual_address { + uint64_t partition_id; + uint32_t vp_index; + uint32_t padding; + uint64_t control_flags; + uint64_t gva_page; +} hv_input_translate_virtual_address; + +typedef struct hv_output_translate_virtual_address { + union hv_translate_gva_result translation_result; + uint64_t gpa_page; +} hv_output_translate_virtual_address; + +typedef struct hv_register_x64_cpuid_result_parameters { + struct { + uint32_t eax; + uint32_t ecx; + uint8_t subleaf_specific; + uint8_t always_override; + uint16_t padding; + } input; + struct { + uint32_t eax; + uint32_t eax_mask; + uint32_t ebx; + uint32_t ebx_mask; + uint32_t ecx; + uint32_t ecx_mask; + uint32_t edx; + uint32_t edx_mask; + } result; +} hv_register_x64_cpuid_result_parameters; + +typedef struct hv_register_x64_msr_result_parameters { + uint32_t msr_index; + uint32_t access_type; + uint32_t action; /* enum hv_unimplemented_msr_action */ +} hv_register_x64_msr_result_parameters; + +union hv_register_intercept_result_parameters { + struct hv_register_x64_cpuid_result_parameters cpuid; + struct hv_register_x64_msr_result_parameters msr; +}; + +typedef struct hv_input_register_intercept_result { + uint64_t partition_id; + uint32_t vp_index; + uint32_t intercept_type; /* enum hv_intercept_type */ + union hv_register_intercept_result_parameters parameters; +} hv_input_register_intercept_result; + +#endif /* HW_HYPERV_HVHDK_H */ diff --git a/include/hw/hyperv/hvhdk_mini.h b/include/hw/hyperv/hvhdk_mini.h new file mode 100644 index 0000000000..9c2f3cf5ae --- /dev/null +++ b/include/hw/hyperv/hvhdk_mini.h @@ -0,0 +1,102 @@ +/* + * Type definitions for the mshv host interface. + * + * Copyright Microsoft, Corp. 2025 + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_HYPERV_HVHDK_MINI_H +#define HW_HYPERV_HVHDK_MINI_H + +#define HVHVK_MINI_VERSION (25294) + +/* Each generic set contains 64 elements */ +#define HV_GENERIC_SET_SHIFT (6) +#define HV_GENERIC_SET_MASK (63) + +enum hv_generic_set_format { + HV_GENERIC_SET_SPARSE_4K, + HV_GENERIC_SET_ALL, +}; + +enum hv_partition_property_code { + /* Privilege properties */ + HV_PARTITION_PROPERTY_PRIVILEGE_FLAGS = 0x00010000, + HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES = 0x00010001, + + /* Scheduling properties */ + HV_PARTITION_PROPERTY_SUSPEND = 0x00020000, + HV_PARTITION_PROPERTY_CPU_RESERVE = 0x00020001, + HV_PARTITION_PROPERTY_CPU_CAP = 0x00020002, + HV_PARTITION_PROPERTY_CPU_WEIGHT = 0x00020003, + HV_PARTITION_PROPERTY_CPU_GROUP_ID = 0x00020004, + + /* Time properties */ + HV_PARTITION_PROPERTY_TIME_FREEZE = 0x00030003, + HV_PARTITION_PROPERTY_REFERENCE_TIME = 0x00030005, + + /* Debugging properties */ + HV_PARTITION_PROPERTY_DEBUG_CHANNEL_ID = 0x00040000, + + /* Resource properties */ + HV_PARTITION_PROPERTY_VIRTUAL_TLB_PAGE_COUNT = 0x00050000, + HV_PARTITION_PROPERTY_VSM_CONFIG = 0x00050001, + HV_PARTITION_PROPERTY_ZERO_MEMORY_ON_RESET = 0x00050002, + HV_PARTITION_PROPERTY_PROCESSORS_PER_SOCKET = 0x00050003, + HV_PARTITION_PROPERTY_NESTED_TLB_SIZE = 0x00050004, + HV_PARTITION_PROPERTY_GPA_PAGE_ACCESS_TRACKING = 0x00050005, + HV_PARTITION_PROPERTY_VSM_PERMISSIONS_DIRTY_SINCE_LAST_QUERY = 0x00050006, + HV_PARTITION_PROPERTY_SGX_LAUNCH_CONTROL_CONFIG = 0x00050007, + HV_PARTITION_PROPERTY_DEFAULT_SGX_LAUNCH_CONTROL0 = 0x00050008, + HV_PARTITION_PROPERTY_DEFAULT_SGX_LAUNCH_CONTROL1 = 0x00050009, + HV_PARTITION_PROPERTY_DEFAULT_SGX_LAUNCH_CONTROL2 = 0x0005000a, + HV_PARTITION_PROPERTY_DEFAULT_SGX_LAUNCH_CONTROL3 = 0x0005000b, + HV_PARTITION_PROPERTY_ISOLATION_STATE = 0x0005000c, + HV_PARTITION_PROPERTY_ISOLATION_CONTROL = 0x0005000d, + HV_PARTITION_PROPERTY_ALLOCATION_ID = 0x0005000e, + HV_PARTITION_PROPERTY_MONITORING_ID = 0x0005000f, + HV_PARTITION_PROPERTY_IMPLEMENTED_PHYSICAL_ADDRESS_BITS = 0x00050010, + HV_PARTITION_PROPERTY_NON_ARCHITECTURAL_CORE_SHARING = 0x00050011, + HV_PARTITION_PROPERTY_HYPERCALL_DOORBELL_PAGE = 0x00050012, + HV_PARTITION_PROPERTY_ISOLATION_POLICY = 0x00050014, + HV_PARTITION_PROPERTY_UNIMPLEMENTED_MSR_ACTION = 0x00050017, + HV_PARTITION_PROPERTY_SEV_VMGEXIT_OFFLOADS = 0x00050022, + + /* Compatibility properties */ + HV_PARTITION_PROPERTY_PROCESSOR_VENDOR = 0x00060000, + HV_PARTITION_PROPERTY_PROCESSOR_FEATURES_DEPRECATED = 0x00060001, + HV_PARTITION_PROPERTY_PROCESSOR_XSAVE_FEATURES = 0x00060002, + HV_PARTITION_PROPERTY_PROCESSOR_CL_FLUSH_SIZE = 0x00060003, + HV_PARTITION_PROPERTY_ENLIGHTENMENT_MODIFICATIONS = 0x00060004, + HV_PARTITION_PROPERTY_COMPATIBILITY_VERSION = 0x00060005, + HV_PARTITION_PROPERTY_PHYSICAL_ADDRESS_WIDTH = 0x00060006, + HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007, + HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008, + HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009, + HV_PARTITION_PROPERTY_PROCESSOR_FEATURES0 = 0x0006000a, + HV_PARTITION_PROPERTY_PROCESSOR_FEATURES1 = 0x0006000b, + + /* Guest software properties */ + HV_PARTITION_PROPERTY_GUEST_OS_ID = 0x00070000, + + /* Nested virtualization properties */ + HV_PARTITION_PROPERTY_PROCESSOR_VIRTUALIZATION_FEATURES = 0x00080000, +}; + +/* HV Map GPA (Guest Physical Address) Flags */ +#define HV_MAP_GPA_PERMISSIONS_NONE 0x0 +#define HV_MAP_GPA_READABLE 0x1 +#define HV_MAP_GPA_WRITABLE 0x2 +#define HV_MAP_GPA_KERNEL_EXECUTABLE 0x4 +#define HV_MAP_GPA_USER_EXECUTABLE 0x8 +#define HV_MAP_GPA_EXECUTABLE 0xC +#define HV_MAP_GPA_PERMISSIONS_MASK 0xF +#define HV_MAP_GPA_ADJUSTABLE 0x8000 +#define HV_MAP_GPA_NO_ACCESS 0x10000 +#define HV_MAP_GPA_NOT_CACHED 0x200000 +#define HV_MAP_GPA_LARGE_PAGE 0x80000000 + +#define HV_PFN_RNG_PAGEBITS 24 /* HV_SPA_PAGE_RANGE_ADDITIONAL_PAGES_BITS */ + +#endif /* HW_HYPERV_HVHDK_MINI_H */ diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index 717c379f9e..828a7809f7 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -195,7 +195,7 @@ rm -rf "$output/linux-headers/linux" mkdir -p "$output/linux-headers/linux" for header in const.h stddef.h kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h \ psci.h psp-sev.h userfaultfd.h memfd.h mman.h nvme_ioctl.h \ - vduse.h iommufd.h bits.h; do + vduse.h iommufd.h bits.h mshv.h; do cp "$hdrdir/include/linux/$header" "$output/linux-headers/linux" done From a6d6878650a0f4982918e8cd9d7ea6c3c3c681f7 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:26 +0200 Subject: [PATCH 12/35] linux-headers/linux: Add mshv.h headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This file has been added to the tree by running `update-linux-header.sh` on linux v6.16. Signed-off-by: Magnus Kulke Reviewed-by: Daniel P. Berrangé Link: https://lore.kernel.org/r/20250916164847.77883-7-magnuskulke@linux.microsoft.com Signed-off-by: Paolo Bonzini --- linux-headers/linux/mshv.h | 291 +++++++++++++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 linux-headers/linux/mshv.h diff --git a/linux-headers/linux/mshv.h b/linux-headers/linux/mshv.h new file mode 100644 index 0000000000..5bc83db6a3 --- /dev/null +++ b/linux-headers/linux/mshv.h @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Userspace interfaces for /dev/mshv* devices and derived fds + * + * This file is divided into sections containing data structures and IOCTLs for + * a particular set of related devices or derived file descriptors. + * + * The IOCTL definitions are at the end of each section. They are grouped by + * device/fd, so that new IOCTLs can easily be added with a monotonically + * increasing number. + */ +#ifndef _LINUX_MSHV_H +#define _LINUX_MSHV_H + +#include + +#define MSHV_IOCTL 0xB8 + +/* + ******************************************* + * Entry point to main VMM APIs: /dev/mshv * + ******************************************* + */ + +enum { + MSHV_PT_BIT_LAPIC, + MSHV_PT_BIT_X2APIC, + MSHV_PT_BIT_GPA_SUPER_PAGES, + MSHV_PT_BIT_COUNT, +}; + +#define MSHV_PT_FLAGS_MASK ((1 << MSHV_PT_BIT_COUNT) - 1) + +enum { + MSHV_PT_ISOLATION_NONE, + MSHV_PT_ISOLATION_COUNT, +}; + +/** + * struct mshv_create_partition - arguments for MSHV_CREATE_PARTITION + * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_* + * @pt_isolation: MSHV_PT_ISOLATION_* + * + * Returns a file descriptor to act as a handle to a guest partition. + * At this point the partition is not yet initialized in the hypervisor. + * Some operations must be done with the partition in this state, e.g. setting + * so-called "early" partition properties. The partition can then be + * initialized with MSHV_INITIALIZE_PARTITION. + */ +struct mshv_create_partition { + __u64 pt_flags; + __u64 pt_isolation; +}; + +/* /dev/mshv */ +#define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition) + +/* + ************************ + * Child partition APIs * + ************************ + */ + +struct mshv_create_vp { + __u32 vp_index; +}; + +enum { + MSHV_SET_MEM_BIT_WRITABLE, + MSHV_SET_MEM_BIT_EXECUTABLE, + MSHV_SET_MEM_BIT_UNMAP, + MSHV_SET_MEM_BIT_COUNT +}; + +#define MSHV_SET_MEM_FLAGS_MASK ((1 << MSHV_SET_MEM_BIT_COUNT) - 1) + +/* The hypervisor's "native" page size */ +#define MSHV_HV_PAGE_SIZE 0x1000 + +/** + * struct mshv_user_mem_region - arguments for MSHV_SET_GUEST_MEMORY + * @size: Size of the memory region (bytes). Must be aligned to + * MSHV_HV_PAGE_SIZE + * @guest_pfn: Base guest page number to map + * @userspace_addr: Base address of userspace memory. Must be aligned to + * MSHV_HV_PAGE_SIZE + * @flags: Bitmask of 1 << MSHV_SET_MEM_BIT_*. If (1 << MSHV_SET_MEM_BIT_UNMAP) + * is set, ignore other bits. + * @rsvd: MBZ + * + * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA). + * Mappings can't overlap in GPA space or userspace. + * To unmap, these fields must match an existing mapping. + */ +struct mshv_user_mem_region { + __u64 size; + __u64 guest_pfn; + __u64 userspace_addr; + __u8 flags; + __u8 rsvd[7]; +}; + +enum { + MSHV_IRQFD_BIT_DEASSIGN, + MSHV_IRQFD_BIT_RESAMPLE, + MSHV_IRQFD_BIT_COUNT, +}; + +#define MSHV_IRQFD_FLAGS_MASK ((1 << MSHV_IRQFD_BIT_COUNT) - 1) + +struct mshv_user_irqfd { + __s32 fd; + __s32 resamplefd; + __u32 gsi; + __u32 flags; +}; + +enum { + MSHV_IOEVENTFD_BIT_DATAMATCH, + MSHV_IOEVENTFD_BIT_PIO, + MSHV_IOEVENTFD_BIT_DEASSIGN, + MSHV_IOEVENTFD_BIT_COUNT, +}; + +#define MSHV_IOEVENTFD_FLAGS_MASK ((1 << MSHV_IOEVENTFD_BIT_COUNT) - 1) + +struct mshv_user_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 1, 2, 4, or 8 bytes */ + __s32 fd; + __u32 flags; + __u8 rsvd[4]; +}; + +struct mshv_user_irq_entry { + __u32 gsi; + __u32 address_lo; + __u32 address_hi; + __u32 data; +}; + +struct mshv_user_irq_table { + __u32 nr; + __u32 rsvd; /* MBZ */ + struct mshv_user_irq_entry entries[]; +}; + +enum { + MSHV_GPAP_ACCESS_TYPE_ACCESSED, + MSHV_GPAP_ACCESS_TYPE_DIRTY, + MSHV_GPAP_ACCESS_TYPE_COUNT /* Count of enum members */ +}; + +enum { + MSHV_GPAP_ACCESS_OP_NOOP, + MSHV_GPAP_ACCESS_OP_CLEAR, + MSHV_GPAP_ACCESS_OP_SET, + MSHV_GPAP_ACCESS_OP_COUNT /* Count of enum members */ +}; + +/** + * struct mshv_gpap_access_bitmap - arguments for MSHV_GET_GPAP_ACCESS_BITMAP + * @access_type: MSHV_GPAP_ACCESS_TYPE_* - The type of access to record in the + * bitmap + * @access_op: MSHV_GPAP_ACCESS_OP_* - Allows an optional clear or set of all + * the access states in the range, after retrieving the current + * states. + * @rsvd: MBZ + * @page_count: Number of pages + * @gpap_base: Base gpa page number + * @bitmap_ptr: Output buffer for bitmap, at least (page_count + 7) / 8 bytes + * + * Retrieve a bitmap of either ACCESSED or DIRTY bits for a given range of guest + * memory, and optionally clear or set the bits. + */ +struct mshv_gpap_access_bitmap { + __u8 access_type; + __u8 access_op; + __u8 rsvd[6]; + __u64 page_count; + __u64 gpap_base; + __u64 bitmap_ptr; +}; + +/** + * struct mshv_root_hvcall - arguments for MSHV_ROOT_HVCALL + * @code: Hypercall code (HVCALL_*) + * @reps: in: Rep count ('repcount') + * out: Reps completed ('repcomp'). MBZ unless rep hvcall + * @in_sz: Size of input incl rep data. <= MSHV_HV_PAGE_SIZE + * @out_sz: Size of output buffer. <= MSHV_HV_PAGE_SIZE. MBZ if out_ptr is 0 + * @status: in: MBZ + * out: HV_STATUS_* from hypercall + * @rsvd: MBZ + * @in_ptr: Input data buffer (struct hv_input_*). If used with partition or + * vp fd, partition id field is populated by kernel. + * @out_ptr: Output data buffer (optional) + */ +struct mshv_root_hvcall { + __u16 code; + __u16 reps; + __u16 in_sz; + __u16 out_sz; + __u16 status; + __u8 rsvd[6]; + __u64 in_ptr; + __u64 out_ptr; +}; + +/* Partition fds created with MSHV_CREATE_PARTITION */ +#define MSHV_INITIALIZE_PARTITION _IO(MSHV_IOCTL, 0x00) +#define MSHV_CREATE_VP _IOW(MSHV_IOCTL, 0x01, struct mshv_create_vp) +#define MSHV_SET_GUEST_MEMORY _IOW(MSHV_IOCTL, 0x02, struct mshv_user_mem_region) +#define MSHV_IRQFD _IOW(MSHV_IOCTL, 0x03, struct mshv_user_irqfd) +#define MSHV_IOEVENTFD _IOW(MSHV_IOCTL, 0x04, struct mshv_user_ioeventfd) +#define MSHV_SET_MSI_ROUTING _IOW(MSHV_IOCTL, 0x05, struct mshv_user_irq_table) +#define MSHV_GET_GPAP_ACCESS_BITMAP _IOWR(MSHV_IOCTL, 0x06, struct mshv_gpap_access_bitmap) +/* Generic hypercall */ +#define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) + +/* + ******************************** + * VP APIs for child partitions * + ******************************** + */ + +#define MSHV_RUN_VP_BUF_SZ 256 + +/* + * VP state pages may be mapped to userspace via mmap(). + * To specify which state page, use MSHV_VP_MMAP_OFFSET_ values multiplied by + * the system page size. + * e.g. + * long page_size = sysconf(_SC_PAGE_SIZE); + * void *reg_page = mmap(NULL, MSHV_HV_PAGE_SIZE, PROT_READ|PROT_WRITE, + * MAP_SHARED, vp_fd, + * MSHV_VP_MMAP_OFFSET_REGISTERS * page_size); + */ +enum { + MSHV_VP_MMAP_OFFSET_REGISTERS, + MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE, + MSHV_VP_MMAP_OFFSET_GHCB, + MSHV_VP_MMAP_OFFSET_COUNT +}; + +/** + * struct mshv_run_vp - argument for MSHV_RUN_VP + * @msg_buf: On success, the intercept message is copied here. It can be + * interpreted using the relevant hypervisor definitions. + */ +struct mshv_run_vp { + __u8 msg_buf[MSHV_RUN_VP_BUF_SZ]; +}; + +enum { + MSHV_VP_STATE_LAPIC, /* Local interrupt controller state (either arch) */ + MSHV_VP_STATE_XSAVE, /* XSAVE data in compacted form (x86_64) */ + MSHV_VP_STATE_SIMP, + MSHV_VP_STATE_SIEFP, + MSHV_VP_STATE_SYNTHETIC_TIMERS, + MSHV_VP_STATE_COUNT, +}; + +/** + * struct mshv_get_set_vp_state - arguments for MSHV_[GET,SET]_VP_STATE + * @type: MSHV_VP_STATE_* + * @rsvd: MBZ + * @buf_sz: in: 4k page-aligned size of buffer + * out: Actual size of data (on EINVAL, check this to see if buffer + * was too small) + * @buf_ptr: 4k page-aligned data buffer + */ +struct mshv_get_set_vp_state { + __u8 type; + __u8 rsvd[3]; + __u32 buf_sz; + __u64 buf_ptr; +}; + +/* VP fds created with MSHV_CREATE_VP */ +#define MSHV_RUN_VP _IOR(MSHV_IOCTL, 0x00, struct mshv_run_vp) +#define MSHV_GET_VP_STATE _IOWR(MSHV_IOCTL, 0x01, struct mshv_get_set_vp_state) +#define MSHV_SET_VP_STATE _IOWR(MSHV_IOCTL, 0x02, struct mshv_get_set_vp_state) +/* + * Generic hypercall + * Defined above in partition IOCTLs, avoid redefining it here + * #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) + */ + +#endif From d0d2918f968c55628e17e2733b799fcefb50f16b Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Thu, 2 Oct 2025 18:25:02 +0200 Subject: [PATCH 13/35] accel/mshv: Add accelerator skeleton Introduce the initial scaffold for the MSHV (Microsoft Hypervisor) accelerator backend. This includes the basic directory structure and stub implementations needed to integrate with QEMU's accelerator framework. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-8-magnuskulke@linux.microsoft.com [Move include of linux/mshv.h in the per-target section; create include/system/mshv_int.h. - Paolo] Signed-off-by: Paolo Bonzini --- accel/meson.build | 1 + accel/mshv/meson.build | 6 ++ accel/mshv/mshv-all.c | 144 ++++++++++++++++++++++++++++++++++++++ include/system/mshv.h | 12 ++++ include/system/mshv_int.h | 41 +++++++++++ 5 files changed, 204 insertions(+) create mode 100644 accel/mshv/meson.build create mode 100644 accel/mshv/mshv-all.c create mode 100644 include/system/mshv_int.h diff --git a/accel/meson.build b/accel/meson.build index 6349efe682..983dfd0bd5 100644 --- a/accel/meson.build +++ b/accel/meson.build @@ -10,6 +10,7 @@ if have_system subdir('kvm') subdir('xen') subdir('stubs') + subdir('mshv') endif # qtest diff --git a/accel/mshv/meson.build b/accel/mshv/meson.build new file mode 100644 index 0000000000..4c03ac7921 --- /dev/null +++ b/accel/mshv/meson.build @@ -0,0 +1,6 @@ +mshv_ss = ss.source_set() +mshv_ss.add(if_true: files( + 'mshv-all.c' +)) + +specific_ss.add_all(when: 'CONFIG_MSHV', if_true: mshv_ss) diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c new file mode 100644 index 0000000000..ae12f0f58b --- /dev/null +++ b/accel/mshv/mshv-all.c @@ -0,0 +1,144 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: + * Ziqiao Zhou + * Magnus Kulke + * Jinank Jain + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/event_notifier.h" +#include "qemu/module.h" +#include "qemu/main-loop.h" +#include "hw/boards.h" + +#include "hw/hyperv/hvhdk.h" +#include "hw/hyperv/hvhdk_mini.h" +#include "hw/hyperv/hvgdk.h" +#include "linux/mshv.h" + +#include "qemu/accel.h" +#include "qemu/guest-random.h" +#include "accel/accel-ops.h" +#include "accel/accel-cpu-ops.h" +#include "system/cpus.h" +#include "system/runstate.h" +#include "system/accel-blocker.h" +#include "system/address-spaces.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "system/reset.h" +#include "trace.h" +#include +#include +#include + +#define TYPE_MSHV_ACCEL ACCEL_CLASS_NAME("mshv") + +DECLARE_INSTANCE_CHECKER(MshvState, MSHV_STATE, TYPE_MSHV_ACCEL) + +bool mshv_allowed; + +MshvState *mshv_state; + +static int mshv_init(AccelState *as, MachineState *ms) +{ + error_report("unimplemented"); + abort(); +} + +static void mshv_start_vcpu_thread(CPUState *cpu) +{ + error_report("unimplemented"); + abort(); +} + +static void mshv_cpu_synchronize_post_init(CPUState *cpu) +{ + error_report("unimplemented"); + abort(); +} + +static void mshv_cpu_synchronize_post_reset(CPUState *cpu) +{ + error_report("unimplemented"); + abort(); +} + +static void mshv_cpu_synchronize_pre_loadvm(CPUState *cpu) +{ + error_report("unimplemented"); + abort(); +} + +static void mshv_cpu_synchronize(CPUState *cpu) +{ + error_report("unimplemented"); + abort(); +} + +static bool mshv_cpus_are_resettable(void) +{ + error_report("unimplemented"); + abort(); +} + +static void mshv_accel_class_init(ObjectClass *oc, const void *data) +{ + AccelClass *ac = ACCEL_CLASS(oc); + + ac->name = "MSHV"; + ac->init_machine = mshv_init; + ac->allowed = &mshv_allowed; +} + +static void mshv_accel_instance_init(Object *obj) +{ + MshvState *s = MSHV_STATE(obj); + + s->vm = 0; +} + +static const TypeInfo mshv_accel_type = { + .name = TYPE_MSHV_ACCEL, + .parent = TYPE_ACCEL, + .instance_init = mshv_accel_instance_init, + .class_init = mshv_accel_class_init, + .instance_size = sizeof(MshvState), +}; + +static void mshv_accel_ops_class_init(ObjectClass *oc, const void *data) +{ + AccelOpsClass *ops = ACCEL_OPS_CLASS(oc); + + ops->create_vcpu_thread = mshv_start_vcpu_thread; + ops->synchronize_post_init = mshv_cpu_synchronize_post_init; + ops->synchronize_post_reset = mshv_cpu_synchronize_post_reset; + ops->synchronize_state = mshv_cpu_synchronize; + ops->synchronize_pre_loadvm = mshv_cpu_synchronize_pre_loadvm; + ops->cpus_are_resettable = mshv_cpus_are_resettable; + ops->handle_interrupt = generic_handle_interrupt; +} + +static const TypeInfo mshv_accel_ops_type = { + .name = ACCEL_OPS_NAME("mshv"), + .parent = TYPE_ACCEL_OPS, + .class_init = mshv_accel_ops_class_init, + .abstract = true, +}; + +static void mshv_type_init(void) +{ + type_register_static(&mshv_accel_type); + type_register_static(&mshv_accel_ops_type); +} + +type_init(mshv_type_init); diff --git a/include/system/mshv.h b/include/system/mshv.h index 2a504ed81f..434ea9682e 100644 --- a/include/system/mshv.h +++ b/include/system/mshv.h @@ -14,8 +14,17 @@ #ifndef QEMU_MSHV_H #define QEMU_MSHV_H +#include "qemu/osdep.h" +#include "qemu/accel.h" +#include "hw/hyperv/hyperv-proto.h" +#include "hw/hyperv/hvhdk.h" +#include "qapi/qapi-types-common.h" +#include "system/memory.h" +#include "accel/accel-ops.h" + #ifdef COMPILING_PER_TARGET #ifdef CONFIG_MSHV +#include #define CONFIG_MSHV_IS_POSSIBLE #endif #else @@ -30,6 +39,9 @@ extern bool mshv_allowed; #endif #define mshv_msi_via_irqfd_enabled() false +typedef struct MshvState MshvState; +extern MshvState *mshv_state; + /* interrupt */ int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev); int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev); diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h new file mode 100644 index 0000000000..132491b599 --- /dev/null +++ b/include/system/mshv_int.h @@ -0,0 +1,41 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou + * Magnus Kulke + * Jinank Jain + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#ifndef QEMU_MSHV_INT_H +#define QEMU_MSHV_INT_H + +struct AccelCPUState { + int cpufd; + bool dirty; +}; + +typedef struct MshvMemoryListener { + MemoryListener listener; + int as_id; +} MshvMemoryListener; + +typedef struct MshvAddressSpace { + MshvMemoryListener *ml; + AddressSpace *as; +} MshvAddressSpace; + +struct MshvState { + AccelState parent_obj; + int vm; + MshvMemoryListener memory_listener; + /* number of listeners */ + int nr_as; + MshvAddressSpace *as; +}; + +#endif From 5006ea1344d134356a9b2e1afd521cf8df0c6a85 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:28 +0200 Subject: [PATCH 14/35] accel/mshv: Register memory region listeners Add memory listener hooks for the MSHV accelerator to track guest memory regions. This enables the backend to respond to region additions, removals and will be used to manage guest memory mappings inside the hypervisor. Actually registering physical memory in the hypervisor is still stubbed out. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-9-magnuskulke@linux.microsoft.com [mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- accel/mshv/mem.c | 25 +++++++++++++++ accel/mshv/meson.build | 1 + accel/mshv/mshv-all.c | 67 +++++++++++++++++++++++++++++++++++++-- include/system/mshv_int.h | 4 +++ 4 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 accel/mshv/mem.c diff --git a/accel/mshv/mem.c b/accel/mshv/mem.c new file mode 100644 index 0000000000..9889918c31 --- /dev/null +++ b/accel/mshv/mem.c @@ -0,0 +1,25 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: + * Magnus Kulke + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "system/address-spaces.h" +#include "system/mshv.h" +#include "system/mshv_int.h" + +void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, + bool add) +{ + error_report("unimplemented"); + abort(); +} + diff --git a/accel/mshv/meson.build b/accel/mshv/meson.build index 4c03ac7921..8a6beb3fb1 100644 --- a/accel/mshv/meson.build +++ b/accel/mshv/meson.build @@ -1,5 +1,6 @@ mshv_ss = ss.source_set() mshv_ss.add(if_true: files( + 'mem.c', 'mshv-all.c' )) diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c index ae12f0f58b..a684a36677 100644 --- a/accel/mshv/mshv-all.c +++ b/accel/mshv/mshv-all.c @@ -49,10 +49,73 @@ bool mshv_allowed; MshvState *mshv_state; +static void mem_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + MshvMemoryListener *mml; + mml = container_of(listener, MshvMemoryListener, listener); + memory_region_ref(section->mr); + mshv_set_phys_mem(mml, section, true); +} + +static void mem_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + MshvMemoryListener *mml; + mml = container_of(listener, MshvMemoryListener, listener); + mshv_set_phys_mem(mml, section, false); + memory_region_unref(section->mr); +} + +static MemoryListener mshv_memory_listener = { + .name = "mshv", + .priority = MEMORY_LISTENER_PRIORITY_ACCEL, + .region_add = mem_region_add, + .region_del = mem_region_del, +}; + +static MemoryListener mshv_io_listener = { + .name = "mshv", .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND, + /* MSHV does not support PIO eventfd */ +}; + +static void register_mshv_memory_listener(MshvState *s, MshvMemoryListener *mml, + AddressSpace *as, int as_id, + const char *name) +{ + int i; + + mml->listener = mshv_memory_listener; + mml->listener.name = name; + memory_listener_register(&mml->listener, as); + for (i = 0; i < s->nr_as; ++i) { + if (!s->as[i].as) { + s->as[i].as = as; + s->as[i].ml = mml; + break; + } + } +} + static int mshv_init(AccelState *as, MachineState *ms) { - error_report("unimplemented"); - abort(); + MshvState *s; + s = MSHV_STATE(as); + + accel_blocker_init(); + + s->vm = 0; + + s->nr_as = 1; + s->as = g_new0(MshvAddressSpace, s->nr_as); + + mshv_state = s; + + register_mshv_memory_listener(s, &s->memory_listener, &address_space_memory, + 0, "mshv-memory"); + memory_listener_register(&mshv_io_listener, &address_space_io); + + return 0; } static void mshv_start_vcpu_thread(CPUState *cpu) diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index 132491b599..cfa177ff72 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -38,4 +38,8 @@ struct MshvState { MshvAddressSpace *as; }; +/* memory */ +void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, + bool add); + #endif From c5f23bccde416aab52fe14dec6f6716a85f0cd04 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Thu, 2 Oct 2025 18:28:16 +0200 Subject: [PATCH 15/35] accel/mshv: Initialize VM partition Create the MSHV virtual machine by opening a partition and issuing the necessary ioctl to initialize it. This sets up the basic VM structure and initial configuration used by MSHV to manage guest state. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-10-magnuskulke@linux.microsoft.com [Add stubs; fix format strings for trace-events; make mshv_hvcall available only in per-target files; mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- accel/mshv/irq.c | 399 +++++++++++++++++++++++++++++++++++ accel/mshv/mem.c | 129 ++++++++++- accel/mshv/meson.build | 1 + accel/mshv/mshv-all.c | 326 ++++++++++++++++++++++++++++ accel/mshv/trace-events | 26 +++ accel/mshv/trace.h | 14 ++ accel/stubs/meson.build | 1 + accel/stubs/mshv-stub.c | 44 ++++ hw/intc/apic.c | 8 + include/system/mshv.h | 11 +- include/system/mshv_int.h | 25 +++ meson.build | 1 + target/i386/mshv/meson.build | 1 + target/i386/mshv/mshv-cpu.c | 72 +++++++ 14 files changed, 1054 insertions(+), 4 deletions(-) create mode 100644 accel/mshv/irq.c create mode 100644 accel/mshv/trace-events create mode 100644 accel/mshv/trace.h create mode 100644 accel/stubs/mshv-stub.c create mode 100644 target/i386/mshv/mshv-cpu.c diff --git a/accel/mshv/irq.c b/accel/mshv/irq.c new file mode 100644 index 0000000000..adf8f337d9 --- /dev/null +++ b/accel/mshv/irq.c @@ -0,0 +1,399 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou + * Magnus Kulke + * Stanislav Kinsburskii + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "linux/mshv.h" +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "hw/hyperv/hvhdk_mini.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "hw/intc/ioapic.h" +#include "hw/pci/msi.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "trace.h" +#include +#include + +#define MSHV_IRQFD_RESAMPLE_FLAG (1 << MSHV_IRQFD_BIT_RESAMPLE) +#define MSHV_IRQFD_BIT_DEASSIGN_FLAG (1 << MSHV_IRQFD_BIT_DEASSIGN) + +static MshvMsiControl *msi_control; +static QemuMutex msi_control_mutex; + +void mshv_init_msicontrol(void) +{ + qemu_mutex_init(&msi_control_mutex); + msi_control = g_new0(MshvMsiControl, 1); + msi_control->gsi_routes = g_hash_table_new(g_direct_hash, g_direct_equal); + msi_control->updated = false; +} + +static int set_msi_routing(uint32_t gsi, uint64_t addr, uint32_t data) +{ + struct mshv_user_irq_entry *entry; + uint32_t high_addr = addr >> 32; + uint32_t low_addr = addr & 0xFFFFFFFF; + GHashTable *gsi_routes; + + trace_mshv_set_msi_routing(gsi, addr, data); + + if (gsi >= MSHV_MAX_MSI_ROUTES) { + error_report("gsi >= MSHV_MAX_MSI_ROUTES"); + return -1; + } + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + gsi_routes = msi_control->gsi_routes; + entry = g_hash_table_lookup(gsi_routes, GINT_TO_POINTER(gsi)); + + if (entry + && entry->address_hi == high_addr + && entry->address_lo == low_addr + && entry->data == data) + { + /* nothing to update */ + return 0; + } + + /* free old entry */ + g_free(entry); + + /* create new entry */ + entry = g_new0(struct mshv_user_irq_entry, 1); + entry->gsi = gsi; + entry->address_hi = high_addr; + entry->address_lo = low_addr; + entry->data = data; + + g_hash_table_insert(gsi_routes, GINT_TO_POINTER(gsi), entry); + msi_control->updated = true; + } + + return 0; +} + +static int add_msi_routing(uint64_t addr, uint32_t data) +{ + struct mshv_user_irq_entry *route_entry; + uint32_t high_addr = addr >> 32; + uint32_t low_addr = addr & 0xFFFFFFFF; + int gsi; + GHashTable *gsi_routes; + + trace_mshv_add_msi_routing(addr, data); + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + /* find an empty slot */ + gsi = 0; + gsi_routes = msi_control->gsi_routes; + while (gsi < MSHV_MAX_MSI_ROUTES) { + route_entry = g_hash_table_lookup(gsi_routes, GINT_TO_POINTER(gsi)); + if (!route_entry) { + break; + } + gsi++; + } + if (gsi >= MSHV_MAX_MSI_ROUTES) { + error_report("No empty gsi slot available"); + return -1; + } + + /* create new entry */ + route_entry = g_new0(struct mshv_user_irq_entry, 1); + route_entry->gsi = gsi; + route_entry->address_hi = high_addr; + route_entry->address_lo = low_addr; + route_entry->data = data; + + g_hash_table_insert(gsi_routes, GINT_TO_POINTER(gsi), route_entry); + msi_control->updated = true; + } + + return gsi; +} + +static int commit_msi_routing_table(int vm_fd) +{ + guint len; + int i, ret; + size_t table_size; + struct mshv_user_irq_table *table; + GHashTableIter iter; + gpointer key, value; + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + if (!msi_control->updated) { + /* nothing to update */ + return 0; + } + + /* Calculate the size of the table */ + len = g_hash_table_size(msi_control->gsi_routes); + table_size = sizeof(struct mshv_user_irq_table) + + len * sizeof(struct mshv_user_irq_entry); + table = g_malloc0(table_size); + + g_hash_table_iter_init(&iter, msi_control->gsi_routes); + i = 0; + while (g_hash_table_iter_next(&iter, &key, &value)) { + struct mshv_user_irq_entry *entry = value; + table->entries[i] = *entry; + i++; + } + table->nr = i; + + trace_mshv_commit_msi_routing_table(vm_fd, len); + + ret = ioctl(vm_fd, MSHV_SET_MSI_ROUTING, table); + g_free(table); + if (ret < 0) { + error_report("Failed to commit msi routing table"); + return -1; + } + msi_control->updated = false; + } + return 0; +} + +static int remove_msi_routing(uint32_t gsi) +{ + struct mshv_user_irq_entry *route_entry; + GHashTable *gsi_routes; + + trace_mshv_remove_msi_routing(gsi); + + if (gsi >= MSHV_MAX_MSI_ROUTES) { + error_report("Invalid GSI: %u", gsi); + return -1; + } + + assert(msi_control); + + WITH_QEMU_LOCK_GUARD(&msi_control_mutex) { + gsi_routes = msi_control->gsi_routes; + route_entry = g_hash_table_lookup(gsi_routes, GINT_TO_POINTER(gsi)); + if (route_entry) { + g_hash_table_remove(gsi_routes, GINT_TO_POINTER(gsi)); + g_free(route_entry); + msi_control->updated = true; + } + } + + return 0; +} + +/* Pass an eventfd which is to be used for injecting interrupts from userland */ +static int irqfd(int vm_fd, int fd, int resample_fd, uint32_t gsi, + uint32_t flags) +{ + int ret; + struct mshv_user_irqfd arg = { + .fd = fd, + .resamplefd = resample_fd, + .gsi = gsi, + .flags = flags, + }; + + ret = ioctl(vm_fd, MSHV_IRQFD, &arg); + if (ret < 0) { + error_report("Failed to set irqfd: gsi=%u, fd=%d", gsi, fd); + return -1; + } + return ret; +} + +static int register_irqfd(int vm_fd, int event_fd, uint32_t gsi) +{ + int ret; + + trace_mshv_register_irqfd(vm_fd, event_fd, gsi); + + ret = irqfd(vm_fd, event_fd, 0, gsi, 0); + if (ret < 0) { + error_report("Failed to register irqfd: gsi=%u", gsi); + return -1; + } + return 0; +} + +static int register_irqfd_with_resample(int vm_fd, int event_fd, + int resample_fd, uint32_t gsi) +{ + int ret; + uint32_t flags = MSHV_IRQFD_RESAMPLE_FLAG; + + ret = irqfd(vm_fd, event_fd, resample_fd, gsi, flags); + if (ret < 0) { + error_report("Failed to register irqfd with resample: gsi=%u", gsi); + return -errno; + } + return 0; +} + +static int unregister_irqfd(int vm_fd, int event_fd, uint32_t gsi) +{ + int ret; + uint32_t flags = MSHV_IRQFD_BIT_DEASSIGN_FLAG; + + ret = irqfd(vm_fd, event_fd, 0, gsi, flags); + if (ret < 0) { + error_report("Failed to unregister irqfd: gsi=%u", gsi); + return -errno; + } + return 0; +} + +static int irqchip_update_irqfd_notifier_gsi(const EventNotifier *event, + const EventNotifier *resample, + int virq, bool add) +{ + int fd = event_notifier_get_fd(event); + int rfd = resample ? event_notifier_get_fd(resample) : -1; + int vm_fd = mshv_state->vm; + + trace_mshv_irqchip_update_irqfd_notifier_gsi(fd, rfd, virq, add); + + if (!add) { + return unregister_irqfd(vm_fd, fd, virq); + } + + if (rfd > 0) { + return register_irqfd_with_resample(vm_fd, fd, rfd, virq); + } + + return register_irqfd(vm_fd, fd, virq); +} + + +int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev) +{ + MSIMessage msg = { 0, 0 }; + int virq = 0; + + if (pci_available && dev) { + msg = pci_get_msi_message(dev, vector); + virq = add_msi_routing(msg.address, le32_to_cpu(msg.data)); + } + + return virq; +} + +void mshv_irqchip_release_virq(int virq) +{ + remove_msi_routing(virq); +} + +int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev) +{ + int ret; + + ret = set_msi_routing(virq, msg.address, le32_to_cpu(msg.data)); + if (ret < 0) { + error_report("Failed to set msi routing"); + return -1; + } + + return 0; +} + +int mshv_request_interrupt(MshvState *mshv_state, uint32_t interrupt_type, uint32_t vector, + uint32_t vp_index, bool logical_dest_mode, + bool level_triggered) +{ + int ret; + int vm_fd = mshv_state->vm; + + if (vector == 0) { + warn_report("Ignoring request for interrupt vector 0"); + return 0; + } + + union hv_interrupt_control control = { + .interrupt_type = interrupt_type, + .level_triggered = level_triggered, + .logical_dest_mode = logical_dest_mode, + .rsvd = 0, + }; + + struct hv_input_assert_virtual_interrupt arg = {0}; + arg.control = control; + arg.dest_addr = (uint64_t)vp_index; + arg.vector = vector; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_ASSERT_VIRTUAL_INTERRUPT; + args.in_sz = sizeof(arg); + args.in_ptr = (uint64_t)&arg; + + ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to request interrupt"); + return -errno; + } + return 0; +} + +void mshv_irqchip_commit_routes(void) +{ + int ret; + int vm_fd = mshv_state->vm; + + ret = commit_msi_routing_table(vm_fd); + if (ret < 0) { + error_report("Failed to commit msi routing table"); + abort(); + } +} + +int mshv_irqchip_add_irqfd_notifier_gsi(const EventNotifier *event, + const EventNotifier *resample, + int virq) +{ + return irqchip_update_irqfd_notifier_gsi(event, resample, virq, true); +} + +int mshv_irqchip_remove_irqfd_notifier_gsi(const EventNotifier *event, + int virq) +{ + return irqchip_update_irqfd_notifier_gsi(event, NULL, virq, false); +} + +int mshv_reserve_ioapic_msi_routes(int vm_fd) +{ + int ret, gsi; + + /* + * Reserve GSI 0-23 for IOAPIC pins, to avoid conflicts of legacy + * peripherals with MSI-X devices + */ + for (gsi = 0; gsi < IOAPIC_NUM_PINS; gsi++) { + ret = add_msi_routing(0, 0); + if (ret < 0) { + error_report("Failed to reserve GSI %d", gsi); + return -1; + } + } + + ret = commit_msi_routing_table(vm_fd); + if (ret < 0) { + error_report("Failed to commit reserved IOAPIC MSI routes"); + return -1; + } + + return 0; +} diff --git a/accel/mshv/mem.c b/accel/mshv/mem.c index 9889918c31..a0a40eb333 100644 --- a/accel/mshv/mem.c +++ b/accel/mshv/mem.c @@ -12,14 +12,137 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" +#include "linux/mshv.h" #include "system/address-spaces.h" #include "system/mshv.h" #include "system/mshv_int.h" +#include "exec/memattrs.h" +#include +#include "trace.h" + +static int set_guest_memory(int vm_fd, + const struct mshv_user_mem_region *region) +{ + int ret; + + ret = ioctl(vm_fd, MSHV_SET_GUEST_MEMORY, region); + if (ret < 0) { + error_report("failed to set guest memory"); + return -errno; + } + + return 0; +} + +static int map_or_unmap(int vm_fd, const MshvMemoryRegion *mr, bool map) +{ + struct mshv_user_mem_region region = {0}; + + region.guest_pfn = mr->guest_phys_addr >> MSHV_PAGE_SHIFT; + region.size = mr->memory_size; + region.userspace_addr = mr->userspace_addr; + + if (!map) { + region.flags |= (1 << MSHV_SET_MEM_BIT_UNMAP); + trace_mshv_unmap_memory(mr->userspace_addr, mr->guest_phys_addr, + mr->memory_size); + return set_guest_memory(vm_fd, ®ion); + } + + region.flags = BIT(MSHV_SET_MEM_BIT_EXECUTABLE); + if (!mr->readonly) { + region.flags |= BIT(MSHV_SET_MEM_BIT_WRITABLE); + } + + trace_mshv_map_memory(mr->userspace_addr, mr->guest_phys_addr, + mr->memory_size); + return set_guest_memory(vm_fd, ®ion); +} + +static int set_memory(const MshvMemoryRegion *mshv_mr, bool add) +{ + int ret = 0; + + if (!mshv_mr) { + error_report("Invalid mshv_mr"); + return -1; + } + + trace_mshv_set_memory(add, mshv_mr->guest_phys_addr, + mshv_mr->memory_size, + mshv_mr->userspace_addr, mshv_mr->readonly, + ret); + return map_or_unmap(mshv_state->vm, mshv_mr, add); +} + +/* + * Calculate and align the start address and the size of the section. + * Return the size. If the size is 0, the aligned section is empty. + */ +static hwaddr align_section(MemoryRegionSection *section, hwaddr *start) +{ + hwaddr size = int128_get64(section->size); + hwaddr delta, aligned; + + /* + * works in page size chunks, but the function may be called + * with sub-page size and unaligned start address. Pad the start + * address to next and truncate size to previous page boundary. + */ + aligned = ROUND_UP(section->offset_within_address_space, + qemu_real_host_page_size()); + delta = aligned - section->offset_within_address_space; + *start = aligned; + if (delta > size) { + return 0; + } + + return (size - delta) & qemu_real_host_page_mask(); +} void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, bool add) { - error_report("unimplemented"); - abort(); -} + int ret = 0; + MemoryRegion *area = section->mr; + bool writable = !area->readonly && !area->rom_device; + hwaddr start_addr, mr_offset, size; + void *ram; + MshvMemoryRegion mshv_mr = {0}; + size = align_section(section, &start_addr); + trace_mshv_set_phys_mem(add, section->mr->name, start_addr); + + /* + * If the memory device is a writable non-ram area, we do not + * want to map it into the guest memory. If it is not a ROM device, + * we want to remove mshv memory mapping, so accesses will trap. + */ + if (!memory_region_is_ram(area)) { + if (writable) { + return; + } else if (!area->romd_mode) { + add = false; + } + } + + if (!size) { + return; + } + + mr_offset = section->offset_within_region + start_addr - + section->offset_within_address_space; + + ram = memory_region_get_ram_ptr(area) + mr_offset; + + mshv_mr.guest_phys_addr = start_addr; + mshv_mr.memory_size = size; + mshv_mr.readonly = !writable; + mshv_mr.userspace_addr = (uint64_t)ram; + + ret = set_memory(&mshv_mr, add); + if (ret < 0) { + error_report("Failed to set memory region"); + abort(); + } +} diff --git a/accel/mshv/meson.build b/accel/mshv/meson.build index 8a6beb3fb1..f88fc8678c 100644 --- a/accel/mshv/meson.build +++ b/accel/mshv/meson.build @@ -1,5 +1,6 @@ mshv_ss = ss.source_set() mshv_ss.add(if_true: files( + 'irq.c', 'mem.c', 'mshv-all.c' )) diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c index a684a36677..653195c57c 100644 --- a/accel/mshv/mshv-all.c +++ b/accel/mshv/mshv-all.c @@ -7,6 +7,7 @@ * Ziqiao Zhou * Magnus Kulke * Jinank Jain + * Wei Liu * * SPDX-License-Identifier: GPL-2.0-or-later * @@ -23,6 +24,7 @@ #include "hw/hyperv/hvhdk.h" #include "hw/hyperv/hvhdk_mini.h" #include "hw/hyperv/hvgdk.h" +#include "hw/hyperv/hvgdk_mini.h" #include "linux/mshv.h" #include "qemu/accel.h" @@ -49,6 +51,175 @@ bool mshv_allowed; MshvState *mshv_state; +static int init_mshv(int *mshv_fd) +{ + int fd = open("/dev/mshv", O_RDWR | O_CLOEXEC); + if (fd < 0) { + error_report("Failed to open /dev/mshv: %s", strerror(errno)); + return -1; + } + *mshv_fd = fd; + return 0; +} + +/* freeze 1 to pause, 0 to resume */ +static int set_time_freeze(int vm_fd, int freeze) +{ + int ret; + struct hv_input_set_partition_property in = {0}; + in.property_code = HV_PARTITION_PROPERTY_TIME_FREEZE; + in.property_value = freeze; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_SET_PARTITION_PROPERTY; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t)∈ + + ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to set time freeze"); + return -1; + } + + return 0; +} + +static int pause_vm(int vm_fd) +{ + int ret; + + ret = set_time_freeze(vm_fd, 1); + if (ret < 0) { + error_report("Failed to pause partition: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int resume_vm(int vm_fd) +{ + int ret; + + ret = set_time_freeze(vm_fd, 0); + if (ret < 0) { + error_report("Failed to resume partition: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int create_partition(int mshv_fd, int *vm_fd) +{ + int ret; + struct mshv_create_partition args = {0}; + + /* Initialize pt_flags with the desired features */ + uint64_t pt_flags = (1ULL << MSHV_PT_BIT_LAPIC) | + (1ULL << MSHV_PT_BIT_X2APIC) | + (1ULL << MSHV_PT_BIT_GPA_SUPER_PAGES); + + /* Set default isolation type */ + uint64_t pt_isolation = MSHV_PT_ISOLATION_NONE; + + args.pt_flags = pt_flags; + args.pt_isolation = pt_isolation; + + ret = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args); + if (ret < 0) { + error_report("Failed to create partition: %s", strerror(errno)); + return -1; + } + + *vm_fd = ret; + return 0; +} + +static int set_synthetic_proc_features(int vm_fd) +{ + int ret; + struct hv_input_set_partition_property in = {0}; + union hv_partition_synthetic_processor_features features = {0}; + + /* Access the bitfield and set the desired features */ + features.hypervisor_present = 1; + features.hv1 = 1; + features.access_partition_reference_counter = 1; + features.access_synic_regs = 1; + features.access_synthetic_timer_regs = 1; + features.access_partition_reference_tsc = 1; + features.access_frequency_regs = 1; + features.access_intr_ctrl_regs = 1; + features.access_vp_index = 1; + features.access_hypercall_regs = 1; + features.tb_flush_hypercalls = 1; + features.synthetic_cluster_ipi = 1; + features.direct_synthetic_timers = 1; + + mshv_arch_amend_proc_features(&features); + + in.property_code = HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES; + in.property_value = features.as_uint64[0]; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_SET_PARTITION_PROPERTY; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t)∈ + + trace_mshv_hvcall_args("synthetic_proc_features", args.code, args.in_sz); + + ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to set synthethic proc features"); + return -errno; + } + return 0; +} + +static int initialize_vm(int vm_fd) +{ + int ret = ioctl(vm_fd, MSHV_INITIALIZE_PARTITION); + if (ret < 0) { + error_report("Failed to initialize partition: %s", strerror(errno)); + return -1; + } + return 0; +} + +static int create_vm(int mshv_fd, int *vm_fd) +{ + int ret = create_partition(mshv_fd, vm_fd); + if (ret < 0) { + return -1; + } + + ret = set_synthetic_proc_features(*vm_fd); + if (ret < 0) { + return -1; + } + + ret = initialize_vm(*vm_fd); + if (ret < 0) { + return -1; + } + + ret = mshv_reserve_ioapic_msi_routes(*vm_fd); + if (ret < 0) { + return -1; + } + + ret = mshv_arch_post_init_vm(*vm_fd); + if (ret < 0) { + return -1; + } + + /* Always create a frozen partition */ + pause_vm(*vm_fd); + + return 0; +} + static void mem_region_add(MemoryListener *listener, MemoryRegionSection *section) { @@ -67,11 +238,124 @@ static void mem_region_del(MemoryListener *listener, memory_region_unref(section->mr); } +typedef enum { + DATAMATCH_NONE, + DATAMATCH_U32, + DATAMATCH_U64, +} DatamatchTag; + +typedef struct { + DatamatchTag tag; + union { + uint32_t u32; + uint64_t u64; + } value; +} Datamatch; + +/* flags: determine whether to de/assign */ +static int ioeventfd(int vm_fd, int event_fd, uint64_t addr, Datamatch dm, + uint32_t flags) +{ + struct mshv_user_ioeventfd args = {0}; + args.fd = event_fd; + args.addr = addr; + args.flags = flags; + + if (dm.tag == DATAMATCH_NONE) { + args.datamatch = 0; + } else { + flags |= BIT(MSHV_IOEVENTFD_BIT_DATAMATCH); + args.flags = flags; + if (dm.tag == DATAMATCH_U64) { + args.len = sizeof(uint64_t); + args.datamatch = dm.value.u64; + } else { + args.len = sizeof(uint32_t); + args.datamatch = dm.value.u32; + } + } + + return ioctl(vm_fd, MSHV_IOEVENTFD, &args); +} + +static int unregister_ioevent(int vm_fd, int event_fd, uint64_t mmio_addr) +{ + uint32_t flags = 0; + Datamatch dm = {0}; + + flags |= BIT(MSHV_IOEVENTFD_BIT_DEASSIGN); + dm.tag = DATAMATCH_NONE; + + return ioeventfd(vm_fd, event_fd, mmio_addr, dm, flags); +} + +static int register_ioevent(int vm_fd, int event_fd, uint64_t mmio_addr, + uint64_t val, bool is_64bit, bool is_datamatch) +{ + uint32_t flags = 0; + Datamatch dm = {0}; + + if (!is_datamatch) { + dm.tag = DATAMATCH_NONE; + } else if (is_64bit) { + dm.tag = DATAMATCH_U64; + dm.value.u64 = val; + } else { + dm.tag = DATAMATCH_U32; + dm.value.u32 = val; + } + + return ioeventfd(vm_fd, event_fd, mmio_addr, dm, flags); +} + +static void mem_ioeventfd_add(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, + EventNotifier *e) +{ + int fd = event_notifier_get_fd(e); + int ret; + bool is_64 = int128_get64(section->size) == 8; + uint64_t addr = section->offset_within_address_space; + + trace_mshv_mem_ioeventfd_add(addr, int128_get64(section->size), data); + + ret = register_ioevent(mshv_state->vm, fd, addr, data, is_64, match_data); + + if (ret < 0) { + error_report("Failed to register ioeventfd: %s (%d)", strerror(-ret), + -ret); + abort(); + } +} + +static void mem_ioeventfd_del(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, + EventNotifier *e) +{ + int fd = event_notifier_get_fd(e); + int ret; + uint64_t addr = section->offset_within_address_space; + + trace_mshv_mem_ioeventfd_del(section->offset_within_address_space, + int128_get64(section->size), data); + + ret = unregister_ioevent(mshv_state->vm, fd, addr); + if (ret < 0) { + error_report("Failed to unregister ioeventfd: %s (%d)", strerror(-ret), + -ret); + abort(); + } +} + static MemoryListener mshv_memory_listener = { .name = "mshv", .priority = MEMORY_LISTENER_PRIORITY_ACCEL, .region_add = mem_region_add, .region_del = mem_region_del, + .eventfd_add = mem_ioeventfd_add, + .eventfd_del = mem_ioeventfd_del, }; static MemoryListener mshv_io_listener = { @@ -97,15 +381,57 @@ static void register_mshv_memory_listener(MshvState *s, MshvMemoryListener *mml, } } +int mshv_hvcall(int fd, const struct mshv_root_hvcall *args) +{ + int ret = 0; + + ret = ioctl(fd, MSHV_ROOT_HVCALL, args); + if (ret < 0) { + error_report("Failed to perform hvcall: %s", strerror(errno)); + return -1; + } + return ret; +} + + static int mshv_init(AccelState *as, MachineState *ms) { MshvState *s; + int mshv_fd, vm_fd, ret; + + if (mshv_state) { + warn_report("MSHV accelerator already initialized"); + return 0; + } + s = MSHV_STATE(as); accel_blocker_init(); s->vm = 0; + ret = init_mshv(&mshv_fd); + if (ret < 0) { + return -1; + } + + mshv_init_msicontrol(); + + ret = create_vm(mshv_fd, &vm_fd); + if (ret < 0) { + close(mshv_fd); + return -1; + } + + ret = resume_vm(vm_fd); + if (ret < 0) { + close(mshv_fd); + close(vm_fd); + return -1; + } + + s->vm = vm_fd; + s->fd = mshv_fd; s->nr_as = 1; s->as = g_new0(MshvAddressSpace, s->nr_as); diff --git a/accel/mshv/trace-events b/accel/mshv/trace-events new file mode 100644 index 0000000000..6130c4abf8 --- /dev/null +++ b/accel/mshv/trace-events @@ -0,0 +1,26 @@ +# Authors: Ziqiao Zhou +# Magnus Kulke +# +# SPDX-License-Identifier: GPL-2.0-or-later + +mshv_set_memory(bool add, uint64_t gpa, uint64_t size, uint64_t user_addr, bool readonly, int ret) "add=%d gpa=0x%" PRIx64 " size=0x%" PRIx64 " user=0x%" PRIx64 " readonly=%d result=%d" +mshv_mem_ioeventfd_add(uint64_t addr, uint32_t size, uint32_t data) "addr=0x%" PRIx64 " size=%d data=0x%x" +mshv_mem_ioeventfd_del(uint64_t addr, uint32_t size, uint32_t data) "addr=0x%" PRIx64 " size=%d data=0x%x" + +mshv_hvcall_args(const char* hvcall, uint16_t code, uint16_t in_sz) "built args for '%s' code: %d in_sz: %d" + +mshv_handle_interrupt(uint32_t cpu, int mask) "cpu_index=%d mask=0x%x" +mshv_set_msi_routing(uint32_t gsi, uint64_t addr, uint32_t data) "gsi=%d addr=0x%" PRIx64 " data=0x%x" +mshv_remove_msi_routing(uint32_t gsi) "gsi=%d" +mshv_add_msi_routing(uint64_t addr, uint32_t data) "addr=0x%" PRIx64 " data=0x%x" +mshv_commit_msi_routing_table(int vm_fd, int len) "vm_fd=%d table_size=%d" +mshv_register_irqfd(int vm_fd, int event_fd, uint32_t gsi) "vm_fd=%d event_fd=%d gsi=%d" +mshv_irqchip_update_irqfd_notifier_gsi(int event_fd, int resample_fd, int virq, bool add) "event_fd=%d resample_fd=%d virq=%d add=%d" + +mshv_insn_fetch(uint64_t addr, size_t size) "gpa=0x%" PRIx64 " size=%zu" +mshv_mem_write(uint64_t addr, size_t size) "\tgpa=0x%" PRIx64 " size=%zu" +mshv_mem_read(uint64_t addr, size_t size) "\tgpa=0x%" PRIx64 " size=%zu" +mshv_map_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_unmap_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_set_phys_mem(bool add, const char *name, uint64_t gpa) "\tadd=%d name=%s gpa=0x%010" PRIx64 +mshv_handle_mmio(uint64_t gva, uint64_t gpa, uint64_t size, uint8_t access_type) "\tgva=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%" PRIx64 " access_type=%d" diff --git a/accel/mshv/trace.h b/accel/mshv/trace.h new file mode 100644 index 0000000000..0dca48f917 --- /dev/null +++ b/accel/mshv/trace.h @@ -0,0 +1,14 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: + * Ziqiao Zhou + * Magnus Kulke + * + * SPDX-License-Identifier: GPL-2.0-or-later + * + */ + +#include "trace/trace-accel_mshv.h" diff --git a/accel/stubs/meson.build b/accel/stubs/meson.build index 9dfc4f9dda..48eccd1b86 100644 --- a/accel/stubs/meson.build +++ b/accel/stubs/meson.build @@ -5,5 +5,6 @@ system_stubs_ss.add(when: 'CONFIG_TCG', if_false: files('tcg-stub.c')) system_stubs_ss.add(when: 'CONFIG_HVF', if_false: files('hvf-stub.c')) system_stubs_ss.add(when: 'CONFIG_NVMM', if_false: files('nvmm-stub.c')) system_stubs_ss.add(when: 'CONFIG_WHPX', if_false: files('whpx-stub.c')) +system_stubs_ss.add(when: 'CONFIG_MSHV', if_false: files('mshv-stub.c')) specific_ss.add_all(when: ['CONFIG_SYSTEM_ONLY'], if_true: system_stubs_ss) diff --git a/accel/stubs/mshv-stub.c b/accel/stubs/mshv-stub.c new file mode 100644 index 0000000000..e499b199d9 --- /dev/null +++ b/accel/stubs/mshv-stub.c @@ -0,0 +1,44 @@ +/* + * QEMU MSHV stub + * + * Copyright Red Hat, Inc. 2025 + * + * Author: Paolo Bonzini + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/pci/msi.h" +#include "system/mshv.h" + +bool mshv_allowed; + +int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev) +{ + return -ENOSYS; +} + +void mshv_irqchip_release_virq(int virq) +{ +} + +int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev) +{ + return -ENOSYS; +} + +void mshv_irqchip_commit_routes(void) +{ +} + +int mshv_irqchip_add_irqfd_notifier_gsi(const EventNotifier *n, + const EventNotifier *rn, int virq) +{ + return -ENOSYS; +} + +int mshv_irqchip_remove_irqfd_notifier_gsi(const EventNotifier *n, int virq) +{ + return -ENOSYS; +} diff --git a/hw/intc/apic.c b/hw/intc/apic.c index bcb103560c..6d7859640c 100644 --- a/hw/intc/apic.c +++ b/hw/intc/apic.c @@ -27,6 +27,7 @@ #include "hw/pci/msi.h" #include "qemu/host-utils.h" #include "system/kvm.h" +#include "system/mshv.h" #include "trace.h" #include "hw/i386/apic-msidef.h" #include "qapi/error.h" @@ -932,6 +933,13 @@ static void apic_send_msi(MSIMessage *msi) uint8_t trigger_mode = (data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; uint8_t delivery = (data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x7; /* XXX: Ignore redirection hint. */ +#ifdef CONFIG_MSHV + if (mshv_enabled()) { + mshv_request_interrupt(mshv_state, delivery, vector, dest, + dest_mode, trigger_mode); + return; + } +#endif apic_deliver_irq(dest, dest_mode, delivery, vector, trigger_mode); } diff --git a/include/system/mshv.h b/include/system/mshv.h index 434ea9682e..1011e81df4 100644 --- a/include/system/mshv.h +++ b/include/system/mshv.h @@ -31,18 +31,27 @@ #define CONFIG_MSHV_IS_POSSIBLE #endif +#define MSHV_MAX_MSI_ROUTES 4096 + +#define MSHV_PAGE_SHIFT 12 + #ifdef CONFIG_MSHV_IS_POSSIBLE extern bool mshv_allowed; #define mshv_enabled() (mshv_allowed) +#define mshv_msi_via_irqfd_enabled() mshv_enabled() #else /* CONFIG_MSHV_IS_POSSIBLE */ #define mshv_enabled() false -#endif #define mshv_msi_via_irqfd_enabled() false +#endif typedef struct MshvState MshvState; extern MshvState *mshv_state; /* interrupt */ +int mshv_request_interrupt(MshvState *mshv_state, uint32_t interrupt_type, uint32_t vector, + uint32_t vp_index, bool logical_destination_mode, + bool level_triggered); + int mshv_irqchip_add_msi_route(int vector, PCIDevice *dev); int mshv_irqchip_update_msi_route(int virq, MSIMessage msg, PCIDevice *dev); void mshv_irqchip_commit_routes(void); diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index cfa177ff72..b36124a0ea 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -36,10 +36,35 @@ struct MshvState { /* number of listeners */ int nr_as; MshvAddressSpace *as; + int fd; }; +typedef struct MshvMsiControl { + bool updated; + GHashTable *gsi_routes; +} MshvMsiControl; + +void mshv_arch_amend_proc_features( + union hv_partition_synthetic_processor_features *features); +int mshv_arch_post_init_vm(int vm_fd); + +#if defined COMPILING_PER_TARGET && defined CONFIG_MSHV_IS_POSSIBLE +int mshv_hvcall(int fd, const struct mshv_root_hvcall *args); +#endif + /* memory */ +typedef struct MshvMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + bool readonly; +} MshvMemoryRegion; + void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, bool add); +/* interrupt */ +void mshv_init_msicontrol(void); +int mshv_reserve_ioapic_msi_routes(int vm_fd); + #endif diff --git a/meson.build b/meson.build index 167021ed62..afaefa0172 100644 --- a/meson.build +++ b/meson.build @@ -3668,6 +3668,7 @@ if have_system trace_events_subdirs += [ 'accel/hvf', 'accel/kvm', + 'accel/mshv', 'audio', 'backends', 'backends/tpm', diff --git a/target/i386/mshv/meson.build b/target/i386/mshv/meson.build index 8ddaa7c11d..647e5dafb7 100644 --- a/target/i386/mshv/meson.build +++ b/target/i386/mshv/meson.build @@ -1,6 +1,7 @@ i386_mshv_ss = ss.source_set() i386_mshv_ss.add(files( + 'mshv-cpu.c', 'x86.c', )) diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c new file mode 100644 index 0000000000..de0c26bc6c --- /dev/null +++ b/target/i386/mshv/mshv-cpu.c @@ -0,0 +1,72 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Ziqiao Zhou + * Magnus Kulke + * Jinank Jain + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/typedefs.h" + +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "system/address-spaces.h" +#include "linux/mshv.h" +#include "hw/hyperv/hvgdk.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "hw/hyperv/hvhdk_mini.h" + +#include "trace-accel_mshv.h" +#include "trace.h" + +void mshv_arch_amend_proc_features( + union hv_partition_synthetic_processor_features *features) +{ + features->access_guest_idle_reg = 1; +} + +/* + * Default Microsoft Hypervisor behavior for unimplemented MSR is to send a + * fault to the guest if it tries to access it. It is possible to override + * this behavior with a more suitable option i.e., ignore writes from the guest + * and return zero in attempt to read unimplemented. + */ +static int set_unimplemented_msr_action(int vm_fd) +{ + struct hv_input_set_partition_property in = {0}; + struct mshv_root_hvcall args = {0}; + + in.property_code = HV_PARTITION_PROPERTY_UNIMPLEMENTED_MSR_ACTION; + in.property_value = HV_UNIMPLEMENTED_MSR_ACTION_IGNORE_WRITE_READ_ZERO; + + args.code = HVCALL_SET_PARTITION_PROPERTY; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t)∈ + + trace_mshv_hvcall_args("unimplemented_msr_action", args.code, args.in_sz); + + int ret = mshv_hvcall(vm_fd, &args); + if (ret < 0) { + error_report("Failed to set unimplemented MSR action"); + return -1; + } + return 0; +} + +int mshv_arch_post_init_vm(int vm_fd) +{ + int ret; + + ret = set_unimplemented_msr_action(vm_fd); + if (ret < 0) { + error_report("Failed to set unimplemented MSR action"); + } + + return ret; +} From 4dc5d4257259764b6fcd035870517fc4140d8962 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:30 +0200 Subject: [PATCH 16/35] accel/mshv: Add vCPU creation and execution loop Create MSHV vCPUs using MSHV_CREATE_VP and initialize their state. Register the MSHV CPU execution loop loop with the QEMU accelerator framework to enable guest code execution. The target/i386 functionality is still mostly stubbed out and will be populated in a later commit in this series. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-11-magnuskulke@linux.microsoft.com [Fix g_free/g_clear_pointer confusion; rename qemu_wait_io_event; mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- accel/mshv/mshv-all.c | 186 +++++++++++++++++++++++++++++++++--- accel/mshv/trace-events | 2 + include/system/mshv.h | 2 +- include/system/mshv_int.h | 20 ++++ target/i386/mshv/mshv-cpu.c | 63 ++++++++++++ 5 files changed, 260 insertions(+), 13 deletions(-) diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c index 653195c57c..e02421d79d 100644 --- a/accel/mshv/mshv-all.c +++ b/accel/mshv/mshv-all.c @@ -393,6 +393,24 @@ int mshv_hvcall(int fd, const struct mshv_root_hvcall *args) return ret; } +static int mshv_init_vcpu(CPUState *cpu) +{ + int vm_fd = mshv_state->vm; + uint8_t vp_index = cpu->cpu_index; + int ret; + + mshv_arch_init_vcpu(cpu); + cpu->accel = g_new0(AccelCPUState, 1); + + ret = mshv_create_vcpu(vm_fd, vp_index, &cpu->accel->cpufd); + if (ret < 0) { + return -1; + } + + cpu->accel->dirty = true; + + return 0; +} static int mshv_init(AccelState *as, MachineState *ms) { @@ -415,6 +433,8 @@ static int mshv_init(AccelState *as, MachineState *ms) return -1; } + mshv_init_mmio_emu(); + mshv_init_msicontrol(); ret = create_vm(mshv_fd, &vm_fd); @@ -444,40 +464,182 @@ static int mshv_init(AccelState *as, MachineState *ms) return 0; } +static int mshv_destroy_vcpu(CPUState *cpu) +{ + int cpu_fd = mshv_vcpufd(cpu); + int vm_fd = mshv_state->vm; + + mshv_remove_vcpu(vm_fd, cpu_fd); + mshv_vcpufd(cpu) = 0; + + mshv_arch_destroy_vcpu(cpu); + g_clear_pointer(&cpu->accel, g_free); + return 0; +} + +static int mshv_cpu_exec(CPUState *cpu) +{ + hv_message mshv_msg; + enum MshvVmExit exit_reason; + int ret = 0; + + bql_unlock(); + cpu_exec_start(cpu); + + do { + if (cpu->accel->dirty) { + ret = mshv_arch_put_registers(cpu); + if (ret) { + error_report("Failed to put registers after init: %s", + strerror(-ret)); + ret = -1; + break; + } + cpu->accel->dirty = false; + } + + ret = mshv_run_vcpu(mshv_state->vm, cpu, &mshv_msg, &exit_reason); + if (ret < 0) { + error_report("Failed to run on vcpu %d", cpu->cpu_index); + abort(); + } + + switch (exit_reason) { + case MshvVmExitIgnore: + break; + default: + ret = EXCP_INTERRUPT; + break; + } + } while (ret == 0); + + cpu_exec_end(cpu); + bql_lock(); + + if (ret < 0) { + cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); + vm_stop(RUN_STATE_INTERNAL_ERROR); + } + + return ret; +} + +static void *mshv_vcpu_thread(void *arg) +{ + CPUState *cpu = arg; + int ret; + + rcu_register_thread(); + + bql_lock(); + qemu_thread_get_self(cpu->thread); + cpu->thread_id = qemu_get_thread_id(); + current_cpu = cpu; + ret = mshv_init_vcpu(cpu); + if (ret < 0) { + error_report("Failed to init vcpu %d", cpu->cpu_index); + goto cleanup; + } + + /* signal CPU creation */ + cpu_thread_signal_created(cpu); + qemu_guest_random_seed_thread_part2(cpu->random_seed); + + do { + qemu_process_cpu_events(cpu); + if (cpu_can_run(cpu)) { + mshv_cpu_exec(cpu); + } + } while (!cpu->unplug || cpu_can_run(cpu)); + + mshv_destroy_vcpu(cpu); +cleanup: + cpu_thread_signal_destroyed(cpu); + bql_unlock(); + rcu_unregister_thread(); + return NULL; +} + static void mshv_start_vcpu_thread(CPUState *cpu) { - error_report("unimplemented"); - abort(); + char thread_name[VCPU_THREAD_NAME_SIZE]; + + cpu->thread = g_malloc0(sizeof(QemuThread)); + cpu->halt_cond = g_malloc0(sizeof(QemuCond)); + + qemu_cond_init(cpu->halt_cond); + + trace_mshv_start_vcpu_thread(thread_name, cpu->cpu_index); + qemu_thread_create(cpu->thread, thread_name, mshv_vcpu_thread, cpu, + QEMU_THREAD_JOINABLE); +} + +static void do_mshv_cpu_synchronize_post_init(CPUState *cpu, + run_on_cpu_data arg) +{ + int ret = mshv_arch_put_registers(cpu); + if (ret < 0) { + error_report("Failed to put registers after init: %s", strerror(-ret)); + abort(); + } + + cpu->accel->dirty = false; } static void mshv_cpu_synchronize_post_init(CPUState *cpu) { - error_report("unimplemented"); - abort(); + run_on_cpu(cpu, do_mshv_cpu_synchronize_post_init, RUN_ON_CPU_NULL); } static void mshv_cpu_synchronize_post_reset(CPUState *cpu) { - error_report("unimplemented"); - abort(); + int ret = mshv_arch_put_registers(cpu); + if (ret) { + error_report("Failed to put registers after reset: %s", + strerror(-ret)); + cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); + vm_stop(RUN_STATE_INTERNAL_ERROR); + } + cpu->accel->dirty = false; +} + +static void do_mshv_cpu_synchronize_pre_loadvm(CPUState *cpu, + run_on_cpu_data arg) +{ + cpu->accel->dirty = true; } static void mshv_cpu_synchronize_pre_loadvm(CPUState *cpu) { - error_report("unimplemented"); - abort(); + run_on_cpu(cpu, do_mshv_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); +} + +static void do_mshv_cpu_synchronize(CPUState *cpu, run_on_cpu_data arg) +{ + if (!cpu->accel->dirty) { + int ret = mshv_load_regs(cpu); + if (ret < 0) { + error_report("Failed to load registers for vcpu %d", + cpu->cpu_index); + + cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); + vm_stop(RUN_STATE_INTERNAL_ERROR); + } + + cpu->accel->dirty = true; + } } static void mshv_cpu_synchronize(CPUState *cpu) { - error_report("unimplemented"); - abort(); + if (!cpu->accel->dirty) { + run_on_cpu(cpu, do_mshv_cpu_synchronize, RUN_ON_CPU_NULL); + } } static bool mshv_cpus_are_resettable(void) { - error_report("unimplemented"); - abort(); + return false; } static void mshv_accel_class_init(ObjectClass *oc, const void *data) diff --git a/accel/mshv/trace-events b/accel/mshv/trace-events index 6130c4abf8..a4dffeb24a 100644 --- a/accel/mshv/trace-events +++ b/accel/mshv/trace-events @@ -3,6 +3,8 @@ # # SPDX-License-Identifier: GPL-2.0-or-later +mshv_start_vcpu_thread(const char* thread, uint32_t cpu) "thread=%s cpu_index=%d" + mshv_set_memory(bool add, uint64_t gpa, uint64_t size, uint64_t user_addr, bool readonly, int ret) "add=%d gpa=0x%" PRIx64 " size=0x%" PRIx64 " user=0x%" PRIx64 " readonly=%d result=%d" mshv_mem_ioeventfd_add(uint64_t addr, uint32_t size, uint32_t data) "addr=0x%" PRIx64 " size=%d data=0x%x" mshv_mem_ioeventfd_del(uint64_t addr, uint32_t size, uint32_t data) "addr=0x%" PRIx64 " size=%d data=0x%x" diff --git a/include/system/mshv.h b/include/system/mshv.h index 1011e81df4..bbc42f4dc3 100644 --- a/include/system/mshv.h +++ b/include/system/mshv.h @@ -41,7 +41,7 @@ extern bool mshv_allowed; #define mshv_msi_via_irqfd_enabled() mshv_enabled() #else /* CONFIG_MSHV_IS_POSSIBLE */ #define mshv_enabled() false -#define mshv_msi_via_irqfd_enabled() false +#define mshv_msi_via_irqfd_enabled() mshv_enabled() #endif typedef struct MshvState MshvState; diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index b36124a0ea..fb80f69772 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -14,6 +14,8 @@ #ifndef QEMU_MSHV_INT_H #define QEMU_MSHV_INT_H +typedef struct hyperv_message hv_message; + struct AccelCPUState { int cpufd; bool dirty; @@ -44,6 +46,24 @@ typedef struct MshvMsiControl { GHashTable *gsi_routes; } MshvMsiControl; +#define mshv_vcpufd(cpu) (cpu->accel->cpufd) + +/* cpu */ +typedef enum MshvVmExit { + MshvVmExitIgnore = 0, + MshvVmExitShutdown = 1, + MshvVmExitSpecial = 2, +} MshvVmExit; + +void mshv_init_mmio_emu(void); +int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd); +void mshv_remove_vcpu(int vm_fd, int cpu_fd); +int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit); +int mshv_load_regs(CPUState *cpu); +int mshv_store_regs(CPUState *cpu); +int mshv_arch_put_registers(const CPUState *cpu); +void mshv_arch_init_vcpu(CPUState *cpu); +void mshv_arch_destroy_vcpu(CPUState *cpu); void mshv_arch_amend_proc_features( union hv_partition_synthetic_processor_features *features); int mshv_arch_post_init_vm(int vm_fd); diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index de0c26bc6c..02d71ebc14 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -22,15 +22,78 @@ #include "hw/hyperv/hvgdk_mini.h" #include "hw/hyperv/hvhdk_mini.h" +#include "cpu.h" +#include "emulate/x86_decode.h" +#include "emulate/x86_emu.h" +#include "emulate/x86_flags.h" + #include "trace-accel_mshv.h" #include "trace.h" +int mshv_store_regs(CPUState *cpu) +{ + error_report("unimplemented"); + abort(); +} + +int mshv_load_regs(CPUState *cpu) +{ + error_report("unimplemented"); + abort(); +} + +int mshv_arch_put_registers(const CPUState *cpu) +{ + error_report("unimplemented"); + abort(); +} + void mshv_arch_amend_proc_features( union hv_partition_synthetic_processor_features *features) { features->access_guest_idle_reg = 1; } +int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit) +{ + error_report("unimplemented"); + abort(); +} + +void mshv_remove_vcpu(int vm_fd, int cpu_fd) +{ + error_report("unimplemented"); + abort(); +} + +int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd) +{ + error_report("unimplemented"); + abort(); +} + +void mshv_init_mmio_emu(void) +{ + error_report("unimplemented"); + abort(); +} + +void mshv_arch_init_vcpu(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + + env->emu_mmio_buf = g_new(char, 4096); +} + +void mshv_arch_destroy_vcpu(CPUState *cpu) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + + g_clear_pointer(&env->emu_mmio_buf, g_free); +} + /* * Default Microsoft Hypervisor behavior for unimplemented MSR is to send a * fault to the guest if it tries to access it. It is possible to override From 575df4df54e060661db45e6f80293b29ea8901c1 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:31 +0200 Subject: [PATCH 17/35] accel/mshv: Add vCPU signal handling Implement signal handling for MSHV vCPUs to support asynchronous interrupts from the main thread. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-12-magnuskulke@linux.microsoft.com Signed-off-by: Paolo Bonzini --- accel/mshv/mshv-all.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c index e02421d79d..fa1f8f35bd 100644 --- a/accel/mshv/mshv-all.c +++ b/accel/mshv/mshv-all.c @@ -524,6 +524,35 @@ static int mshv_cpu_exec(CPUState *cpu) return ret; } +/* + * The signal handler is triggered when QEMU's main thread receives a SIG_IPI + * (SIGUSR1). This signal causes the current CPU thread to be kicked, forcing a + * VM exit on the CPU. The VM exit generates an exit reason that breaks the loop + * (see mshv_cpu_exec). If the exit is due to a Ctrl+A+x command, the system + * will shut down. For other cases, the system will continue running. + */ +static void sa_ipi_handler(int sig) +{ + /* TODO: call IOCTL to set_immediate_exit, once implemented. */ + + qemu_cpu_kick_self(); +} + +static void init_signal(CPUState *cpu) +{ + /* init cpu signals */ + struct sigaction sigact; + sigset_t set; + + memset(&sigact, 0, sizeof(sigact)); + sigact.sa_handler = sa_ipi_handler; + sigaction(SIG_IPI, &sigact, NULL); + + pthread_sigmask(SIG_BLOCK, NULL, &set); + sigdelset(&set, SIG_IPI); + pthread_sigmask(SIG_SETMASK, &set, NULL); +} + static void *mshv_vcpu_thread(void *arg) { CPUState *cpu = arg; @@ -540,6 +569,7 @@ static void *mshv_vcpu_thread(void *arg) error_report("Failed to init vcpu %d", cpu->cpu_index); goto cleanup; } + init_signal(cpu); /* signal CPU creation */ cpu_thread_signal_created(cpu); From 4ed605c06e743b2ec0bfe35954d88b8c64dd3767 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:32 +0200 Subject: [PATCH 18/35] target/i386/mshv: Add CPU create and remove logic Implement MSHV-specific hooks for vCPU creation and teardown in the i386 target. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-13-magnuskulke@linux.microsoft.com Signed-off-by: Paolo Bonzini --- target/i386/mshv/mshv-cpu.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 02d71ebc14..5069ab7a22 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -30,6 +30,8 @@ #include "trace-accel_mshv.h" #include "trace.h" +#include + int mshv_store_regs(CPUState *cpu) { error_report("unimplemented"); @@ -62,20 +64,29 @@ int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit) void mshv_remove_vcpu(int vm_fd, int cpu_fd) { - error_report("unimplemented"); - abort(); + close(cpu_fd); } + int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd) { - error_report("unimplemented"); - abort(); + int ret; + struct mshv_create_vp vp_arg = { + .vp_index = vp_index, + }; + ret = ioctl(vm_fd, MSHV_CREATE_VP, &vp_arg); + if (ret < 0) { + error_report("failed to create mshv vcpu: %s", strerror(errno)); + return -1; + } + + *cpu_fd = ret; + + return 0; } void mshv_init_mmio_emu(void) { - error_report("unimplemented"); - abort(); } void mshv_arch_init_vcpu(CPUState *cpu) From 2bd7a6aa4743ee85c77c0520e8b422b3bfeab386 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Thu, 2 Oct 2025 18:13:31 +0200 Subject: [PATCH 19/35] target/i386/mshv: Implement mshv_store_regs() Add support for writing general-purpose registers to MSHV vCPUs during initialization or migration using the MSHV register interface. A generic set_register call is introduced to abstract the HV call over the various register types. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-14-magnuskulke@linux.microsoft.com [mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- include/system/mshv.h | 1 + include/system/mshv_int.h | 2 + target/i386/mshv/mshv-cpu.c | 116 +++++++++++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 2 deletions(-) diff --git a/include/system/mshv.h b/include/system/mshv.h index bbc42f4dc3..8b1fc20c80 100644 --- a/include/system/mshv.h +++ b/include/system/mshv.h @@ -18,6 +18,7 @@ #include "qemu/accel.h" #include "hw/hyperv/hyperv-proto.h" #include "hw/hyperv/hvhdk.h" +#include "hw/hyperv/hvgdk_mini.h" #include "qapi/qapi-types-common.h" #include "system/memory.h" #include "accel/accel-ops.h" diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index fb80f69772..731841af92 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -61,6 +61,8 @@ void mshv_remove_vcpu(int vm_fd, int cpu_fd); int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit); int mshv_load_regs(CPUState *cpu); int mshv_store_regs(CPUState *cpu); +int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, + size_t n_regs); int mshv_arch_put_registers(const CPUState *cpu); void mshv_arch_init_vcpu(CPUState *cpu); void mshv_arch_destroy_vcpu(CPUState *cpu); diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 5069ab7a22..9ead03ca2d 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -32,12 +32,124 @@ #include +static enum hv_register_name STANDARD_REGISTER_NAMES[18] = { + HV_X64_REGISTER_RAX, + HV_X64_REGISTER_RBX, + HV_X64_REGISTER_RCX, + HV_X64_REGISTER_RDX, + HV_X64_REGISTER_RSI, + HV_X64_REGISTER_RDI, + HV_X64_REGISTER_RSP, + HV_X64_REGISTER_RBP, + HV_X64_REGISTER_R8, + HV_X64_REGISTER_R9, + HV_X64_REGISTER_R10, + HV_X64_REGISTER_R11, + HV_X64_REGISTER_R12, + HV_X64_REGISTER_R13, + HV_X64_REGISTER_R14, + HV_X64_REGISTER_R15, + HV_X64_REGISTER_RIP, + HV_X64_REGISTER_RFLAGS, +}; + +int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, + size_t n_regs) +{ + int cpu_fd = mshv_vcpufd(cpu); + int vp_index = cpu->cpu_index; + size_t in_sz, assocs_sz; + hv_input_set_vp_registers *in; + struct mshv_root_hvcall args = {0}; + int ret; + + /* find out the size of the struct w/ a flexible array at the tail */ + assocs_sz = n_regs * sizeof(hv_register_assoc); + in_sz = sizeof(hv_input_set_vp_registers) + assocs_sz; + + /* fill the input struct */ + in = g_malloc0(in_sz); + in->vp_index = vp_index; + memcpy(in->elements, assocs, assocs_sz); + + /* create the hvcall envelope */ + args.code = HVCALL_SET_VP_REGISTERS; + args.in_sz = in_sz; + args.in_ptr = (uint64_t) in; + args.reps = (uint16_t) n_regs; + + /* perform the call */ + ret = mshv_hvcall(cpu_fd, &args); + g_free(in); + if (ret < 0) { + error_report("Failed to set registers"); + return -1; + } + + /* assert we set all registers */ + if (args.reps != n_regs) { + error_report("Failed to set registers: expected %zu elements" + ", got %u", n_regs, args.reps); + return -1; + } + + return 0; +} + +static int set_standard_regs(const CPUState *cpu) +{ + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + hv_register_assoc assocs[ARRAY_SIZE(STANDARD_REGISTER_NAMES)]; + int ret; + size_t n_regs = ARRAY_SIZE(STANDARD_REGISTER_NAMES); + + /* set names */ + for (size_t i = 0; i < ARRAY_SIZE(STANDARD_REGISTER_NAMES); i++) { + assocs[i].name = STANDARD_REGISTER_NAMES[i]; + } + assocs[0].value.reg64 = env->regs[R_EAX]; + assocs[1].value.reg64 = env->regs[R_EBX]; + assocs[2].value.reg64 = env->regs[R_ECX]; + assocs[3].value.reg64 = env->regs[R_EDX]; + assocs[4].value.reg64 = env->regs[R_ESI]; + assocs[5].value.reg64 = env->regs[R_EDI]; + assocs[6].value.reg64 = env->regs[R_ESP]; + assocs[7].value.reg64 = env->regs[R_EBP]; + assocs[8].value.reg64 = env->regs[R_R8]; + assocs[9].value.reg64 = env->regs[R_R9]; + assocs[10].value.reg64 = env->regs[R_R10]; + assocs[11].value.reg64 = env->regs[R_R11]; + assocs[12].value.reg64 = env->regs[R_R12]; + assocs[13].value.reg64 = env->regs[R_R13]; + assocs[14].value.reg64 = env->regs[R_R14]; + assocs[15].value.reg64 = env->regs[R_R15]; + assocs[16].value.reg64 = env->eip; + lflags_to_rflags(env); + assocs[17].value.reg64 = env->eflags; + + ret = mshv_set_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to set standard registers"); + return -errno; + } + return 0; +} + int mshv_store_regs(CPUState *cpu) { - error_report("unimplemented"); - abort(); + int ret; + + ret = set_standard_regs(cpu); + if (ret < 0) { + error_report("Failed to store standard registers"); + return -1; + } + + return 0; } + int mshv_load_regs(CPUState *cpu) { error_report("unimplemented"); From 66480a048a01fc833ba153410282609f26166141 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:34 +0200 Subject: [PATCH 20/35] target/i386/mshv: Implement mshv_get_standard_regs() Fetch standard register state from MSHV vCPUs to support debugging, migration, and other introspection features in QEMU. Fetch standard register state from a MHSV vCPU's. A generic get_regs() function and a mapper to map the different register representations are introduced. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-15-magnuskulke@linux.microsoft.com [mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- include/system/mshv_int.h | 1 + target/i386/mshv/mshv-cpu.c | 116 +++++++++++++++++++++++++++++++++++- 2 files changed, 115 insertions(+), 2 deletions(-) diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index 731841af92..b0a79296ad 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -58,6 +58,7 @@ typedef enum MshvVmExit { void mshv_init_mmio_emu(void); int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd); void mshv_remove_vcpu(int vm_fd, int cpu_fd); +int mshv_get_standard_regs(CPUState *cpu); int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit); int mshv_load_regs(CPUState *cpu); int mshv_store_regs(CPUState *cpu); diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 9ead03ca2d..4e3eee113b 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -96,6 +96,66 @@ int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, return 0; } +static int get_generic_regs(CPUState *cpu, hv_register_assoc *assocs, + size_t n_regs) +{ + int cpu_fd = mshv_vcpufd(cpu); + int vp_index = cpu->cpu_index; + hv_input_get_vp_registers *in; + hv_register_value *values; + size_t in_sz, names_sz, values_sz; + int i, ret; + struct mshv_root_hvcall args = {0}; + + /* find out the size of the struct w/ a flexible array at the tail */ + names_sz = n_regs * sizeof(hv_register_name); + in_sz = sizeof(hv_input_get_vp_registers) + names_sz; + + /* fill the input struct */ + in = g_malloc0(in_sz); + in->vp_index = vp_index; + for (i = 0; i < n_regs; i++) { + in->names[i] = assocs[i].name; + } + + /* allocate value output buffer */ + values_sz = n_regs * sizeof(union hv_register_value); + values = g_malloc0(values_sz); + + /* create the hvcall envelope */ + args.code = HVCALL_GET_VP_REGISTERS; + args.in_sz = in_sz; + args.in_ptr = (uint64_t) in; + args.out_sz = values_sz; + args.out_ptr = (uint64_t) values; + args.reps = (uint16_t) n_regs; + + /* perform the call */ + ret = mshv_hvcall(cpu_fd, &args); + g_free(in); + if (ret < 0) { + g_free(values); + error_report("Failed to retrieve registers"); + return -1; + } + + /* assert we got all registers */ + if (args.reps != n_regs) { + g_free(values); + error_report("Failed to retrieve registers: expected %zu elements" + ", got %u", n_regs, args.reps); + return -1; + } + + /* copy values into assoc */ + for (i = 0; i < n_regs; i++) { + assocs[i].value = values[i]; + } + g_free(values); + + return 0; +} + static int set_standard_regs(const CPUState *cpu) { X86CPU *x86cpu = X86_CPU(cpu); @@ -149,11 +209,63 @@ int mshv_store_regs(CPUState *cpu) return 0; } +static void populate_standard_regs(const hv_register_assoc *assocs, + CPUX86State *env) +{ + env->regs[R_EAX] = assocs[0].value.reg64; + env->regs[R_EBX] = assocs[1].value.reg64; + env->regs[R_ECX] = assocs[2].value.reg64; + env->regs[R_EDX] = assocs[3].value.reg64; + env->regs[R_ESI] = assocs[4].value.reg64; + env->regs[R_EDI] = assocs[5].value.reg64; + env->regs[R_ESP] = assocs[6].value.reg64; + env->regs[R_EBP] = assocs[7].value.reg64; + env->regs[R_R8] = assocs[8].value.reg64; + env->regs[R_R9] = assocs[9].value.reg64; + env->regs[R_R10] = assocs[10].value.reg64; + env->regs[R_R11] = assocs[11].value.reg64; + env->regs[R_R12] = assocs[12].value.reg64; + env->regs[R_R13] = assocs[13].value.reg64; + env->regs[R_R14] = assocs[14].value.reg64; + env->regs[R_R15] = assocs[15].value.reg64; + + env->eip = assocs[16].value.reg64; + env->eflags = assocs[17].value.reg64; + rflags_to_lflags(env); +} + +int mshv_get_standard_regs(CPUState *cpu) +{ + struct hv_register_assoc assocs[ARRAY_SIZE(STANDARD_REGISTER_NAMES)]; + int ret; + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + size_t n_regs = ARRAY_SIZE(STANDARD_REGISTER_NAMES); + + for (size_t i = 0; i < n_regs; i++) { + assocs[i].name = STANDARD_REGISTER_NAMES[i]; + } + ret = get_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to get standard registers"); + return -1; + } + + populate_standard_regs(assocs, env); + return 0; +} int mshv_load_regs(CPUState *cpu) { - error_report("unimplemented"); - abort(); + int ret; + + ret = mshv_get_standard_regs(cpu); + if (ret < 0) { + error_report("Failed to load standard registers"); + return -1; + } + + return 0; } int mshv_arch_put_registers(const CPUState *cpu) From 0382c2c8544da66619d16988caaab8e3d9e881d4 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:35 +0200 Subject: [PATCH 21/35] target/i386/mshv: Implement mshv_get_special_regs() Retrieve special registers (e.g. segment, control, and descriptor table registers) from MSHV vCPUs. Various helper functions to map register state representations between Qemu and MSHV are introduced. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-16-magnuskulke@linux.microsoft.com [mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- include/system/mshv_int.h | 1 + target/i386/mshv/mshv-cpu.c | 104 ++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index b0a79296ad..c6e6e8af30 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -59,6 +59,7 @@ void mshv_init_mmio_emu(void); int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd); void mshv_remove_vcpu(int vm_fd, int cpu_fd); int mshv_get_standard_regs(CPUState *cpu); +int mshv_get_special_regs(CPUState *cpu); int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit); int mshv_load_regs(CPUState *cpu); int mshv_store_regs(CPUState *cpu); diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 4e3eee113b..bc75686f82 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -53,6 +53,26 @@ static enum hv_register_name STANDARD_REGISTER_NAMES[18] = { HV_X64_REGISTER_RFLAGS, }; +static enum hv_register_name SPECIAL_REGISTER_NAMES[17] = { + HV_X64_REGISTER_CS, + HV_X64_REGISTER_DS, + HV_X64_REGISTER_ES, + HV_X64_REGISTER_FS, + HV_X64_REGISTER_GS, + HV_X64_REGISTER_SS, + HV_X64_REGISTER_TR, + HV_X64_REGISTER_LDTR, + HV_X64_REGISTER_GDTR, + HV_X64_REGISTER_IDTR, + HV_X64_REGISTER_CR0, + HV_X64_REGISTER_CR2, + HV_X64_REGISTER_CR3, + HV_X64_REGISTER_CR4, + HV_X64_REGISTER_CR8, + HV_X64_REGISTER_EFER, + HV_X64_REGISTER_APIC_BASE, +}; + int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, size_t n_regs) { @@ -255,6 +275,84 @@ int mshv_get_standard_regs(CPUState *cpu) return 0; } +static inline void populate_segment_reg(const hv_x64_segment_register *hv_seg, + SegmentCache *seg) +{ + memset(seg, 0, sizeof(SegmentCache)); + + seg->base = hv_seg->base; + seg->limit = hv_seg->limit; + seg->selector = hv_seg->selector; + + seg->flags = (hv_seg->segment_type << DESC_TYPE_SHIFT) + | (hv_seg->present * DESC_P_MASK) + | (hv_seg->descriptor_privilege_level << DESC_DPL_SHIFT) + | (hv_seg->_default << DESC_B_SHIFT) + | (hv_seg->non_system_segment * DESC_S_MASK) + | (hv_seg->_long << DESC_L_SHIFT) + | (hv_seg->granularity * DESC_G_MASK) + | (hv_seg->available * DESC_AVL_MASK); + +} + +static inline void populate_table_reg(const hv_x64_table_register *hv_seg, + SegmentCache *tbl) +{ + memset(tbl, 0, sizeof(SegmentCache)); + + tbl->base = hv_seg->base; + tbl->limit = hv_seg->limit; +} + +static void populate_special_regs(const hv_register_assoc *assocs, + X86CPU *x86cpu) +{ + CPUX86State *env = &x86cpu->env; + + populate_segment_reg(&assocs[0].value.segment, &env->segs[R_CS]); + populate_segment_reg(&assocs[1].value.segment, &env->segs[R_DS]); + populate_segment_reg(&assocs[2].value.segment, &env->segs[R_ES]); + populate_segment_reg(&assocs[3].value.segment, &env->segs[R_FS]); + populate_segment_reg(&assocs[4].value.segment, &env->segs[R_GS]); + populate_segment_reg(&assocs[5].value.segment, &env->segs[R_SS]); + + populate_segment_reg(&assocs[6].value.segment, &env->tr); + populate_segment_reg(&assocs[7].value.segment, &env->ldt); + + populate_table_reg(&assocs[8].value.table, &env->gdt); + populate_table_reg(&assocs[9].value.table, &env->idt); + + env->cr[0] = assocs[10].value.reg64; + env->cr[2] = assocs[11].value.reg64; + env->cr[3] = assocs[12].value.reg64; + env->cr[4] = assocs[13].value.reg64; + + cpu_set_apic_tpr(x86cpu->apic_state, assocs[14].value.reg64); + env->efer = assocs[15].value.reg64; + cpu_set_apic_base(x86cpu->apic_state, assocs[16].value.reg64); +} + + +int mshv_get_special_regs(CPUState *cpu) +{ + struct hv_register_assoc assocs[ARRAY_SIZE(SPECIAL_REGISTER_NAMES)]; + int ret; + X86CPU *x86cpu = X86_CPU(cpu); + size_t n_regs = ARRAY_SIZE(SPECIAL_REGISTER_NAMES); + + for (size_t i = 0; i < n_regs; i++) { + assocs[i].name = SPECIAL_REGISTER_NAMES[i]; + } + ret = get_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to get special registers"); + return -errno; + } + + populate_special_regs(assocs, x86cpu); + return 0; +} + int mshv_load_regs(CPUState *cpu) { int ret; @@ -265,6 +363,12 @@ int mshv_load_regs(CPUState *cpu) return -1; } + ret = mshv_get_special_regs(cpu); + if (ret < 0) { + error_report("Failed to load special registers"); + return -1; + } + return 0; } From 25a1d871e0f19bbf8fee471af5fefec52465bb09 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:36 +0200 Subject: [PATCH 22/35] target/i386/mshv: Implement mshv_arch_put_registers() Write CPU register state to MSHV vCPUs. Various mapping functions to prepare the payload for the HV call have been implemented. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-17-magnuskulke@linux.microsoft.com [mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- include/system/mshv_int.h | 15 +++ target/i386/mshv/mshv-cpu.c | 237 ++++++++++++++++++++++++++++++++++++ 2 files changed, 252 insertions(+) diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index c6e6e8af30..0ea8d504fa 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -49,6 +49,20 @@ typedef struct MshvMsiControl { #define mshv_vcpufd(cpu) (cpu->accel->cpufd) /* cpu */ +typedef struct MshvFPU { + uint8_t fpr[8][16]; + uint16_t fcw; + uint16_t fsw; + uint8_t ftwx; + uint8_t pad1; + uint16_t last_opcode; + uint64_t last_ip; + uint64_t last_dp; + uint8_t xmm[16][16]; + uint32_t mxcsr; + uint32_t pad2; +} MshvFPU; + typedef enum MshvVmExit { MshvVmExitIgnore = 0, MshvVmExitShutdown = 1, @@ -58,6 +72,7 @@ typedef enum MshvVmExit { void mshv_init_mmio_emu(void); int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd); void mshv_remove_vcpu(int vm_fd, int cpu_fd); +int mshv_configure_vcpu(const CPUState *cpu, const MshvFPU *fpu, uint64_t xcr0); int mshv_get_standard_regs(CPUState *cpu); int mshv_get_special_regs(CPUState *cpu); int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit); diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index bc75686f82..8b10c79e54 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -73,6 +73,35 @@ static enum hv_register_name SPECIAL_REGISTER_NAMES[17] = { HV_X64_REGISTER_APIC_BASE, }; +static enum hv_register_name FPU_REGISTER_NAMES[26] = { + HV_X64_REGISTER_XMM0, + HV_X64_REGISTER_XMM1, + HV_X64_REGISTER_XMM2, + HV_X64_REGISTER_XMM3, + HV_X64_REGISTER_XMM4, + HV_X64_REGISTER_XMM5, + HV_X64_REGISTER_XMM6, + HV_X64_REGISTER_XMM7, + HV_X64_REGISTER_XMM8, + HV_X64_REGISTER_XMM9, + HV_X64_REGISTER_XMM10, + HV_X64_REGISTER_XMM11, + HV_X64_REGISTER_XMM12, + HV_X64_REGISTER_XMM13, + HV_X64_REGISTER_XMM14, + HV_X64_REGISTER_XMM15, + HV_X64_REGISTER_FP_MMX0, + HV_X64_REGISTER_FP_MMX1, + HV_X64_REGISTER_FP_MMX2, + HV_X64_REGISTER_FP_MMX3, + HV_X64_REGISTER_FP_MMX4, + HV_X64_REGISTER_FP_MMX5, + HV_X64_REGISTER_FP_MMX6, + HV_X64_REGISTER_FP_MMX7, + HV_X64_REGISTER_FP_CONTROL_STATUS, + HV_X64_REGISTER_XMM_CONTROL_STATUS, +}; + int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, size_t n_regs) { @@ -372,8 +401,216 @@ int mshv_load_regs(CPUState *cpu) return 0; } +static inline void populate_hv_segment_reg(SegmentCache *seg, + hv_x64_segment_register *hv_reg) +{ + uint32_t flags = seg->flags; + + hv_reg->base = seg->base; + hv_reg->limit = seg->limit; + hv_reg->selector = seg->selector; + hv_reg->segment_type = (flags >> DESC_TYPE_SHIFT) & 0xF; + hv_reg->non_system_segment = (flags & DESC_S_MASK) != 0; + hv_reg->descriptor_privilege_level = (flags >> DESC_DPL_SHIFT) & 0x3; + hv_reg->present = (flags & DESC_P_MASK) != 0; + hv_reg->reserved = 0; + hv_reg->available = (flags & DESC_AVL_MASK) != 0; + hv_reg->_long = (flags >> DESC_L_SHIFT) & 0x1; + hv_reg->_default = (flags >> DESC_B_SHIFT) & 0x1; + hv_reg->granularity = (flags & DESC_G_MASK) != 0; +} + +static inline void populate_hv_table_reg(const struct SegmentCache *seg, + hv_x64_table_register *hv_reg) +{ + memset(hv_reg, 0, sizeof(*hv_reg)); + + hv_reg->base = seg->base; + hv_reg->limit = seg->limit; +} + +static int set_special_regs(const CPUState *cpu) +{ + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + struct hv_register_assoc assocs[ARRAY_SIZE(SPECIAL_REGISTER_NAMES)]; + size_t n_regs = ARRAY_SIZE(SPECIAL_REGISTER_NAMES); + int ret; + + /* set names */ + for (size_t i = 0; i < n_regs; i++) { + assocs[i].name = SPECIAL_REGISTER_NAMES[i]; + } + populate_hv_segment_reg(&env->segs[R_CS], &assocs[0].value.segment); + populate_hv_segment_reg(&env->segs[R_DS], &assocs[1].value.segment); + populate_hv_segment_reg(&env->segs[R_ES], &assocs[2].value.segment); + populate_hv_segment_reg(&env->segs[R_FS], &assocs[3].value.segment); + populate_hv_segment_reg(&env->segs[R_GS], &assocs[4].value.segment); + populate_hv_segment_reg(&env->segs[R_SS], &assocs[5].value.segment); + populate_hv_segment_reg(&env->tr, &assocs[6].value.segment); + populate_hv_segment_reg(&env->ldt, &assocs[7].value.segment); + + populate_hv_table_reg(&env->gdt, &assocs[8].value.table); + populate_hv_table_reg(&env->idt, &assocs[9].value.table); + + assocs[10].value.reg64 = env->cr[0]; + assocs[11].value.reg64 = env->cr[2]; + assocs[12].value.reg64 = env->cr[3]; + assocs[13].value.reg64 = env->cr[4]; + assocs[14].value.reg64 = cpu_get_apic_tpr(x86cpu->apic_state); + assocs[15].value.reg64 = env->efer; + assocs[16].value.reg64 = cpu_get_apic_base(x86cpu->apic_state); + + ret = mshv_set_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to set special registers"); + return -1; + } + + return 0; +} + +static int set_fpu(const CPUState *cpu, const struct MshvFPU *regs) +{ + struct hv_register_assoc assocs[ARRAY_SIZE(FPU_REGISTER_NAMES)]; + union hv_register_value *value; + size_t fp_i; + union hv_x64_fp_control_status_register *ctrl_status; + union hv_x64_xmm_control_status_register *xmm_ctrl_status; + int ret; + size_t n_regs = ARRAY_SIZE(FPU_REGISTER_NAMES); + + /* first 16 registers are xmm0-xmm15 */ + for (size_t i = 0; i < 16; i++) { + assocs[i].name = FPU_REGISTER_NAMES[i]; + value = &assocs[i].value; + memcpy(&value->reg128, ®s->xmm[i], 16); + } + + /* next 8 registers are fp_mmx0-fp_mmx7 */ + for (size_t i = 16; i < 24; i++) { + assocs[i].name = FPU_REGISTER_NAMES[i]; + fp_i = (i - 16); + value = &assocs[i].value; + memcpy(&value->reg128, ®s->fpr[fp_i], 16); + } + + /* last two registers are fp_control_status and xmm_control_status */ + assocs[24].name = FPU_REGISTER_NAMES[24]; + value = &assocs[24].value; + ctrl_status = &value->fp_control_status; + ctrl_status->fp_control = regs->fcw; + ctrl_status->fp_status = regs->fsw; + ctrl_status->fp_tag = regs->ftwx; + ctrl_status->reserved = 0; + ctrl_status->last_fp_op = regs->last_opcode; + ctrl_status->last_fp_rip = regs->last_ip; + + assocs[25].name = FPU_REGISTER_NAMES[25]; + value = &assocs[25].value; + xmm_ctrl_status = &value->xmm_control_status; + xmm_ctrl_status->xmm_status_control = regs->mxcsr; + xmm_ctrl_status->xmm_status_control_mask = 0; + xmm_ctrl_status->last_fp_rdp = regs->last_dp; + + ret = mshv_set_generic_regs(cpu, assocs, n_regs); + if (ret < 0) { + error_report("failed to set fpu registers"); + return -1; + } + + return 0; +} + +static int set_xc_reg(const CPUState *cpu, uint64_t xcr0) +{ + int ret; + struct hv_register_assoc assoc = { + .name = HV_X64_REGISTER_XFEM, + .value.reg64 = xcr0, + }; + + ret = mshv_set_generic_regs(cpu, &assoc, 1); + if (ret < 0) { + error_report("failed to set xcr0"); + return -errno; + } + return 0; +} + +static int set_cpu_state(const CPUState *cpu, const MshvFPU *fpu_regs, + uint64_t xcr0) +{ + int ret; + + ret = set_standard_regs(cpu); + if (ret < 0) { + return ret; + } + ret = set_special_regs(cpu); + if (ret < 0) { + return ret; + } + ret = set_fpu(cpu, fpu_regs); + if (ret < 0) { + return ret; + } + ret = set_xc_reg(cpu, xcr0); + if (ret < 0) { + return ret; + } + return 0; +} + +/* + * TODO: populate topology info: + * + * X86CPU *x86cpu = X86_CPU(cpu); + * CPUX86State *env = &x86cpu->env; + * X86CPUTopoInfo *topo_info = &env->topo_info; + */ +int mshv_configure_vcpu(const CPUState *cpu, const struct MshvFPU *fpu, + uint64_t xcr0) +{ + int ret; + + ret = set_cpu_state(cpu, fpu, xcr0); + if (ret < 0) { + error_report("failed to set cpu state"); + return -1; + } + + return 0; +} + +static int put_regs(const CPUState *cpu) +{ + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + MshvFPU fpu = {0}; + int ret; + + memset(&fpu, 0, sizeof(fpu)); + + ret = mshv_configure_vcpu(cpu, &fpu, env->xcr0); + if (ret < 0) { + error_report("failed to configure vcpu"); + return ret; + } + + return 0; +} + int mshv_arch_put_registers(const CPUState *cpu) { + int ret; + + ret = put_regs(cpu); + if (ret < 0) { + error_report("Failed to put registers"); + return -1; + } + error_report("unimplemented"); abort(); } From ca20d46fa94887a6e42061646a71ee44cbc9adb0 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:37 +0200 Subject: [PATCH 23/35] target/i386/mshv: Set local interrupt controller state To set the local interrupt controller state, perform hv calls retrieving partition state from the hypervisor. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-18-magnuskulke@linux.microsoft.com Signed-off-by: Paolo Bonzini --- target/i386/mshv/mshv-cpu.c | 117 ++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 8b10c79e54..0fe3cbb48d 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" +#include "qemu/memalign.h" #include "qemu/typedefs.h" #include "system/mshv.h" @@ -21,6 +22,7 @@ #include "hw/hyperv/hvgdk.h" #include "hw/hyperv/hvgdk_mini.h" #include "hw/hyperv/hvhdk_mini.h" +#include "hw/i386/apic_internal.h" #include "cpu.h" #include "emulate/x86_decode.h" @@ -562,6 +564,114 @@ static int set_cpu_state(const CPUState *cpu, const MshvFPU *fpu_regs, return 0; } +static int get_vp_state(int cpu_fd, struct mshv_get_set_vp_state *state) +{ + int ret; + + ret = ioctl(cpu_fd, MSHV_GET_VP_STATE, state); + if (ret < 0) { + error_report("failed to get partition state: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int get_lapic(int cpu_fd, + struct hv_local_interrupt_controller_state *state) +{ + int ret; + size_t size = 4096; + /* buffer aligned to 4k, as *state requires that */ + void *buffer = qemu_memalign(size, size); + struct mshv_get_set_vp_state mshv_state = { 0 }; + + mshv_state.buf_ptr = (uint64_t) buffer; + mshv_state.buf_sz = size; + mshv_state.type = MSHV_VP_STATE_LAPIC; + + ret = get_vp_state(cpu_fd, &mshv_state); + if (ret == 0) { + memcpy(state, buffer, sizeof(*state)); + } + qemu_vfree(buffer); + if (ret < 0) { + error_report("failed to get lapic"); + return -1; + } + + return 0; +} + +static uint32_t set_apic_delivery_mode(uint32_t reg, uint32_t mode) +{ + return ((reg) & ~0x700) | ((mode) << 8); +} + +static int set_vp_state(int cpu_fd, const struct mshv_get_set_vp_state *state) +{ + int ret; + + ret = ioctl(cpu_fd, MSHV_SET_VP_STATE, state); + if (ret < 0) { + error_report("failed to set partition state: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int set_lapic(int cpu_fd, + const struct hv_local_interrupt_controller_state *state) +{ + int ret; + size_t size = 4096; + /* buffer aligned to 4k, as *state requires that */ + void *buffer = qemu_memalign(size, size); + struct mshv_get_set_vp_state mshv_state = { 0 }; + + if (!state) { + error_report("lapic state is NULL"); + return -1; + } + memcpy(buffer, state, sizeof(*state)); + + mshv_state.buf_ptr = (uint64_t) buffer; + mshv_state.buf_sz = size; + mshv_state.type = MSHV_VP_STATE_LAPIC; + + ret = set_vp_state(cpu_fd, &mshv_state); + qemu_vfree(buffer); + if (ret < 0) { + error_report("failed to set lapic: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int set_lint(int cpu_fd) +{ + int ret; + uint32_t *lvt_lint0, *lvt_lint1; + + struct hv_local_interrupt_controller_state lapic_state = { 0 }; + ret = get_lapic(cpu_fd, &lapic_state); + if (ret < 0) { + return ret; + } + + lvt_lint0 = &lapic_state.apic_lvt_lint0; + *lvt_lint0 = set_apic_delivery_mode(*lvt_lint0, APIC_DM_EXTINT); + + lvt_lint1 = &lapic_state.apic_lvt_lint1; + *lvt_lint1 = set_apic_delivery_mode(*lvt_lint1, APIC_DM_NMI); + + /* TODO: should we skip setting lapic if the values are the same? */ + + return set_lapic(cpu_fd, &lapic_state); +} + /* * TODO: populate topology info: * @@ -573,6 +683,7 @@ int mshv_configure_vcpu(const CPUState *cpu, const struct MshvFPU *fpu, uint64_t xcr0) { int ret; + int cpu_fd = mshv_vcpufd(cpu); ret = set_cpu_state(cpu, fpu, xcr0); if (ret < 0) { @@ -580,6 +691,12 @@ int mshv_configure_vcpu(const CPUState *cpu, const struct MshvFPU *fpu, return -1; } + ret = set_lint(cpu_fd); + if (ret < 0) { + error_report("failed to set lpic int"); + return -1; + } + return 0; } From 4fa04dd16216e266564606b7da582e5fce07bced Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:38 +0200 Subject: [PATCH 24/35] target/i386/mshv: Register CPUID entries with MSHV Convert the guest CPU's CPUID model into MSHV's format and register it with the hypervisor. This ensures that the guest observes the correct CPU feature set during CPUID instructions. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-19-magnuskulke@linux.microsoft.com Signed-off-by: Paolo Bonzini --- target/i386/mshv/mshv-cpu.c | 206 ++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 0fe3cbb48d..2b7a81274b 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -403,6 +403,206 @@ int mshv_load_regs(CPUState *cpu) return 0; } +static void add_cpuid_entry(GList *cpuid_entries, + uint32_t function, uint32_t index, + uint32_t eax, uint32_t ebx, + uint32_t ecx, uint32_t edx) +{ + struct hv_cpuid_entry *entry; + + entry = g_malloc0(sizeof(struct hv_cpuid_entry)); + entry->function = function; + entry->index = index; + entry->eax = eax; + entry->ebx = ebx; + entry->ecx = ecx; + entry->edx = edx; + + cpuid_entries = g_list_append(cpuid_entries, entry); +} + +static void collect_cpuid_entries(const CPUState *cpu, GList *cpuid_entries) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + uint32_t eax, ebx, ecx, edx; + uint32_t leaf, subleaf; + size_t max_leaf = 0x1F; + size_t max_subleaf = 0x20; + + uint32_t leaves_with_subleaves[] = {0x4, 0x7, 0xD, 0xF, 0x10}; + int n_subleaf_leaves = ARRAY_SIZE(leaves_with_subleaves); + + /* Regular leaves without subleaves */ + for (leaf = 0; leaf <= max_leaf; leaf++) { + bool has_subleaves = false; + for (int i = 0; i < n_subleaf_leaves; i++) { + if (leaf == leaves_with_subleaves[i]) { + has_subleaves = true; + break; + } + } + + if (!has_subleaves) { + cpu_x86_cpuid(env, leaf, 0, &eax, &ebx, &ecx, &edx); + if (eax == 0 && ebx == 0 && ecx == 0 && edx == 0) { + /* all zeroes indicates no more leaves */ + continue; + } + + add_cpuid_entry(cpuid_entries, leaf, 0, eax, ebx, ecx, edx); + continue; + } + + subleaf = 0; + while (subleaf < max_subleaf) { + cpu_x86_cpuid(env, leaf, subleaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0 && ebx == 0 && ecx == 0 && edx == 0) { + /* all zeroes indicates no more leaves */ + break; + } + add_cpuid_entry(cpuid_entries, leaf, 0, eax, ebx, ecx, edx); + subleaf++; + } + } +} + +static int register_intercept_result_cpuid_entry(const CPUState *cpu, + uint8_t subleaf_specific, + uint8_t always_override, + struct hv_cpuid_entry *entry) +{ + int ret; + int vp_index = cpu->cpu_index; + int cpu_fd = mshv_vcpufd(cpu); + + struct hv_register_x64_cpuid_result_parameters cpuid_params = { + .input.eax = entry->function, + .input.ecx = entry->index, + .input.subleaf_specific = subleaf_specific, + .input.always_override = always_override, + .input.padding = 0, + /* + * With regard to masks - these are to specify bits to be overwritten + * The current CpuidEntry structure wouldn't allow to carry the masks + * in addition to the actual register values. For this reason, the + * masks are set to the exact values of the corresponding register bits + * to be registered for an overwrite. To view resulting values the + * hypervisor would return, HvCallGetVpCpuidValues hypercall can be + * used. + */ + .result.eax = entry->eax, + .result.eax_mask = entry->eax, + .result.ebx = entry->ebx, + .result.ebx_mask = entry->ebx, + .result.ecx = entry->ecx, + .result.ecx_mask = entry->ecx, + .result.edx = entry->edx, + .result.edx_mask = entry->edx, + }; + union hv_register_intercept_result_parameters parameters = { + .cpuid = cpuid_params, + }; + + hv_input_register_intercept_result in = {0}; + in.vp_index = vp_index; + in.intercept_type = HV_INTERCEPT_TYPE_X64_CPUID; + in.parameters = parameters; + + struct mshv_root_hvcall args = {0}; + args.code = HVCALL_REGISTER_INTERCEPT_RESULT; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t)∈ + + ret = mshv_hvcall(cpu_fd, &args); + if (ret < 0) { + error_report("failed to register intercept result for cpuid"); + return -1; + } + + return 0; +} + +static int register_intercept_result_cpuid(const CPUState *cpu, + struct hv_cpuid *cpuid) +{ + int ret = 0, entry_ret; + struct hv_cpuid_entry *entry; + uint8_t subleaf_specific, always_override; + + for (size_t i = 0; i < cpuid->nent; i++) { + entry = &cpuid->entries[i]; + + /* set defaults */ + subleaf_specific = 0; + always_override = 1; + + /* Intel */ + /* 0xb - Extended Topology Enumeration Leaf */ + /* 0x1f - V2 Extended Topology Enumeration Leaf */ + /* AMD */ + /* 0x8000_001e - Processor Topology Information */ + /* 0x8000_0026 - Extended CPU Topology */ + if (entry->function == 0xb + || entry->function == 0x1f + || entry->function == 0x8000001e + || entry->function == 0x80000026) { + subleaf_specific = 1; + always_override = 1; + } else if (entry->function == 0x00000001 + || entry->function == 0x80000000 + || entry->function == 0x80000001 + || entry->function == 0x80000008) { + subleaf_specific = 0; + always_override = 1; + } + + entry_ret = register_intercept_result_cpuid_entry(cpu, subleaf_specific, + always_override, + entry); + if ((entry_ret < 0) && (ret == 0)) { + ret = entry_ret; + } + } + + return ret; +} + +static int set_cpuid2(const CPUState *cpu) +{ + int ret; + size_t n_entries, cpuid_size; + struct hv_cpuid *cpuid; + struct hv_cpuid_entry *entry; + GList *entries = NULL; + + collect_cpuid_entries(cpu, entries); + n_entries = g_list_length(entries); + + cpuid_size = sizeof(struct hv_cpuid) + + n_entries * sizeof(struct hv_cpuid_entry); + + cpuid = g_malloc0(cpuid_size); + cpuid->nent = n_entries; + cpuid->padding = 0; + + for (size_t i = 0; i < n_entries; i++) { + entry = g_list_nth_data(entries, i); + cpuid->entries[i] = *entry; + g_free(entry); + } + g_list_free(entries); + + ret = register_intercept_result_cpuid(cpu, cpuid); + g_free(cpuid); + if (ret < 0) { + return ret; + } + + return 0; +} + static inline void populate_hv_segment_reg(SegmentCache *seg, hv_x64_segment_register *hv_reg) { @@ -685,6 +885,12 @@ int mshv_configure_vcpu(const CPUState *cpu, const struct MshvFPU *fpu, int ret; int cpu_fd = mshv_vcpufd(cpu); + ret = set_cpuid2(cpu); + if (ret < 0) { + error_report("failed to set cpuid"); + return -1; + } + ret = set_cpu_state(cpu, fpu, xcr0); if (ret < 0) { error_report("failed to set cpu state"); From f38e2a63e541730e114f6ed09e5f8719e388c8db Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Thu, 2 Oct 2025 18:19:22 +0200 Subject: [PATCH 25/35] target/i386/mshv: Register MSRs with MSHV Build and register the guest vCPU's model-specific registers using the MSHV interface. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-20-magnuskulke@linux.microsoft.com [mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- accel/mshv/meson.build | 1 + accel/mshv/msr.c | 375 ++++++++++++++++++++++++++++++++++++ include/system/mshv_int.h | 17 ++ target/i386/cpu.h | 2 + target/i386/mshv/mshv-cpu.c | 33 ++++ 5 files changed, 428 insertions(+) create mode 100644 accel/mshv/msr.c diff --git a/accel/mshv/meson.build b/accel/mshv/meson.build index f88fc8678c..d3a2b32581 100644 --- a/accel/mshv/meson.build +++ b/accel/mshv/meson.build @@ -2,6 +2,7 @@ mshv_ss = ss.source_set() mshv_ss.add(if_true: files( 'irq.c', 'mem.c', + 'msr.c', 'mshv-all.c' )) diff --git a/accel/mshv/msr.c b/accel/mshv/msr.c new file mode 100644 index 0000000000..e6e5baef50 --- /dev/null +++ b/accel/mshv/msr.c @@ -0,0 +1,375 @@ +/* + * QEMU MSHV support + * + * Copyright Microsoft, Corp. 2025 + * + * Authors: Magnus Kulke + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "system/mshv.h" +#include "system/mshv_int.h" +#include "hw/hyperv/hvgdk_mini.h" +#include "linux/mshv.h" +#include "qemu/error-report.h" + +static uint32_t supported_msrs[64] = { + IA32_MSR_TSC, + IA32_MSR_EFER, + IA32_MSR_KERNEL_GS_BASE, + IA32_MSR_APIC_BASE, + IA32_MSR_PAT, + IA32_MSR_SYSENTER_CS, + IA32_MSR_SYSENTER_ESP, + IA32_MSR_SYSENTER_EIP, + IA32_MSR_STAR, + IA32_MSR_LSTAR, + IA32_MSR_CSTAR, + IA32_MSR_SFMASK, + IA32_MSR_MTRR_DEF_TYPE, + IA32_MSR_MTRR_PHYSBASE0, + IA32_MSR_MTRR_PHYSMASK0, + IA32_MSR_MTRR_PHYSBASE1, + IA32_MSR_MTRR_PHYSMASK1, + IA32_MSR_MTRR_PHYSBASE2, + IA32_MSR_MTRR_PHYSMASK2, + IA32_MSR_MTRR_PHYSBASE3, + IA32_MSR_MTRR_PHYSMASK3, + IA32_MSR_MTRR_PHYSBASE4, + IA32_MSR_MTRR_PHYSMASK4, + IA32_MSR_MTRR_PHYSBASE5, + IA32_MSR_MTRR_PHYSMASK5, + IA32_MSR_MTRR_PHYSBASE6, + IA32_MSR_MTRR_PHYSMASK6, + IA32_MSR_MTRR_PHYSBASE7, + IA32_MSR_MTRR_PHYSMASK7, + IA32_MSR_MTRR_FIX64K_00000, + IA32_MSR_MTRR_FIX16K_80000, + IA32_MSR_MTRR_FIX16K_A0000, + IA32_MSR_MTRR_FIX4K_C0000, + IA32_MSR_MTRR_FIX4K_C8000, + IA32_MSR_MTRR_FIX4K_D0000, + IA32_MSR_MTRR_FIX4K_D8000, + IA32_MSR_MTRR_FIX4K_E0000, + IA32_MSR_MTRR_FIX4K_E8000, + IA32_MSR_MTRR_FIX4K_F0000, + IA32_MSR_MTRR_FIX4K_F8000, + IA32_MSR_TSC_AUX, + IA32_MSR_DEBUG_CTL, + HV_X64_MSR_GUEST_OS_ID, + HV_X64_MSR_SINT0, + HV_X64_MSR_SINT1, + HV_X64_MSR_SINT2, + HV_X64_MSR_SINT3, + HV_X64_MSR_SINT4, + HV_X64_MSR_SINT5, + HV_X64_MSR_SINT6, + HV_X64_MSR_SINT7, + HV_X64_MSR_SINT8, + HV_X64_MSR_SINT9, + HV_X64_MSR_SINT10, + HV_X64_MSR_SINT11, + HV_X64_MSR_SINT12, + HV_X64_MSR_SINT13, + HV_X64_MSR_SINT14, + HV_X64_MSR_SINT15, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_SIEFP, + HV_X64_MSR_SIMP, + HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_EOM, +}; +static const size_t msr_count = ARRAY_SIZE(supported_msrs); + +static int compare_msr_index(const void *a, const void *b) +{ + return *(uint32_t *)a - *(uint32_t *)b; +} + +__attribute__((constructor)) +static void init_sorted_msr_map(void) +{ + qsort(supported_msrs, msr_count, sizeof(uint32_t), compare_msr_index); +} + +static int mshv_is_supported_msr(uint32_t msr) +{ + return bsearch(&msr, supported_msrs, msr_count, sizeof(uint32_t), + compare_msr_index) != NULL; +} + +static int mshv_msr_to_hv_reg_name(uint32_t msr, uint32_t *hv_reg) +{ + switch (msr) { + case IA32_MSR_TSC: + *hv_reg = HV_X64_REGISTER_TSC; + return 0; + case IA32_MSR_EFER: + *hv_reg = HV_X64_REGISTER_EFER; + return 0; + case IA32_MSR_KERNEL_GS_BASE: + *hv_reg = HV_X64_REGISTER_KERNEL_GS_BASE; + return 0; + case IA32_MSR_APIC_BASE: + *hv_reg = HV_X64_REGISTER_APIC_BASE; + return 0; + case IA32_MSR_PAT: + *hv_reg = HV_X64_REGISTER_PAT; + return 0; + case IA32_MSR_SYSENTER_CS: + *hv_reg = HV_X64_REGISTER_SYSENTER_CS; + return 0; + case IA32_MSR_SYSENTER_ESP: + *hv_reg = HV_X64_REGISTER_SYSENTER_ESP; + return 0; + case IA32_MSR_SYSENTER_EIP: + *hv_reg = HV_X64_REGISTER_SYSENTER_EIP; + return 0; + case IA32_MSR_STAR: + *hv_reg = HV_X64_REGISTER_STAR; + return 0; + case IA32_MSR_LSTAR: + *hv_reg = HV_X64_REGISTER_LSTAR; + return 0; + case IA32_MSR_CSTAR: + *hv_reg = HV_X64_REGISTER_CSTAR; + return 0; + case IA32_MSR_SFMASK: + *hv_reg = HV_X64_REGISTER_SFMASK; + return 0; + case IA32_MSR_MTRR_CAP: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_CAP; + return 0; + case IA32_MSR_MTRR_DEF_TYPE: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_DEF_TYPE; + return 0; + case IA32_MSR_MTRR_PHYSBASE0: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0; + return 0; + case IA32_MSR_MTRR_PHYSMASK0: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0; + return 0; + case IA32_MSR_MTRR_PHYSBASE1: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1; + return 0; + case IA32_MSR_MTRR_PHYSMASK1: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1; + return 0; + case IA32_MSR_MTRR_PHYSBASE2: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2; + return 0; + case IA32_MSR_MTRR_PHYSMASK2: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2; + return 0; + case IA32_MSR_MTRR_PHYSBASE3: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3; + return 0; + case IA32_MSR_MTRR_PHYSMASK3: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3; + return 0; + case IA32_MSR_MTRR_PHYSBASE4: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4; + return 0; + case IA32_MSR_MTRR_PHYSMASK4: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4; + return 0; + case IA32_MSR_MTRR_PHYSBASE5: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5; + return 0; + case IA32_MSR_MTRR_PHYSMASK5: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5; + return 0; + case IA32_MSR_MTRR_PHYSBASE6: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6; + return 0; + case IA32_MSR_MTRR_PHYSMASK6: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6; + return 0; + case IA32_MSR_MTRR_PHYSBASE7: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7; + return 0; + case IA32_MSR_MTRR_PHYSMASK7: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7; + return 0; + case IA32_MSR_MTRR_FIX64K_00000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX64K00000; + return 0; + case IA32_MSR_MTRR_FIX16K_80000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX16K80000; + return 0; + case IA32_MSR_MTRR_FIX16K_A0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX16KA0000; + return 0; + case IA32_MSR_MTRR_FIX4K_C0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KC0000; + return 0; + case IA32_MSR_MTRR_FIX4K_C8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KC8000; + return 0; + case IA32_MSR_MTRR_FIX4K_D0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KD0000; + return 0; + case IA32_MSR_MTRR_FIX4K_D8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KD8000; + return 0; + case IA32_MSR_MTRR_FIX4K_E0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KE0000; + return 0; + case IA32_MSR_MTRR_FIX4K_E8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KE8000; + return 0; + case IA32_MSR_MTRR_FIX4K_F0000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KF0000; + return 0; + case IA32_MSR_MTRR_FIX4K_F8000: + *hv_reg = HV_X64_REGISTER_MSR_MTRR_FIX4KF8000; + return 0; + case IA32_MSR_TSC_AUX: + *hv_reg = HV_X64_REGISTER_TSC_AUX; + return 0; + case IA32_MSR_BNDCFGS: + *hv_reg = HV_X64_REGISTER_BNDCFGS; + return 0; + case IA32_MSR_DEBUG_CTL: + *hv_reg = HV_X64_REGISTER_DEBUG_CTL; + return 0; + case IA32_MSR_TSC_ADJUST: + *hv_reg = HV_X64_REGISTER_TSC_ADJUST; + return 0; + case IA32_MSR_SPEC_CTRL: + *hv_reg = HV_X64_REGISTER_SPEC_CTRL; + return 0; + case HV_X64_MSR_GUEST_OS_ID: + *hv_reg = HV_REGISTER_GUEST_OS_ID; + return 0; + case HV_X64_MSR_SINT0: + *hv_reg = HV_REGISTER_SINT0; + return 0; + case HV_X64_MSR_SINT1: + *hv_reg = HV_REGISTER_SINT1; + return 0; + case HV_X64_MSR_SINT2: + *hv_reg = HV_REGISTER_SINT2; + return 0; + case HV_X64_MSR_SINT3: + *hv_reg = HV_REGISTER_SINT3; + return 0; + case HV_X64_MSR_SINT4: + *hv_reg = HV_REGISTER_SINT4; + return 0; + case HV_X64_MSR_SINT5: + *hv_reg = HV_REGISTER_SINT5; + return 0; + case HV_X64_MSR_SINT6: + *hv_reg = HV_REGISTER_SINT6; + return 0; + case HV_X64_MSR_SINT7: + *hv_reg = HV_REGISTER_SINT7; + return 0; + case HV_X64_MSR_SINT8: + *hv_reg = HV_REGISTER_SINT8; + return 0; + case HV_X64_MSR_SINT9: + *hv_reg = HV_REGISTER_SINT9; + return 0; + case HV_X64_MSR_SINT10: + *hv_reg = HV_REGISTER_SINT10; + return 0; + case HV_X64_MSR_SINT11: + *hv_reg = HV_REGISTER_SINT11; + return 0; + case HV_X64_MSR_SINT12: + *hv_reg = HV_REGISTER_SINT12; + return 0; + case HV_X64_MSR_SINT13: + *hv_reg = HV_REGISTER_SINT13; + return 0; + case HV_X64_MSR_SINT14: + *hv_reg = HV_REGISTER_SINT14; + return 0; + case HV_X64_MSR_SINT15: + *hv_reg = HV_REGISTER_SINT15; + return 0; + case IA32_MSR_MISC_ENABLE: + *hv_reg = HV_X64_REGISTER_MSR_IA32_MISC_ENABLE; + return 0; + case HV_X64_MSR_SCONTROL: + *hv_reg = HV_REGISTER_SCONTROL; + return 0; + case HV_X64_MSR_SIEFP: + *hv_reg = HV_REGISTER_SIEFP; + return 0; + case HV_X64_MSR_SIMP: + *hv_reg = HV_REGISTER_SIMP; + return 0; + case HV_X64_MSR_REFERENCE_TSC: + *hv_reg = HV_REGISTER_REFERENCE_TSC; + return 0; + case HV_X64_MSR_EOM: + *hv_reg = HV_REGISTER_EOM; + return 0; + default: + error_report("failed to map MSR %u to HV register name", msr); + return -1; + } +} + +static int set_msrs(const CPUState *cpu, GList *msrs) +{ + size_t n_msrs; + GList *entries; + MshvMsrEntry *entry; + enum hv_register_name name; + struct hv_register_assoc *assoc; + int ret; + size_t i = 0; + + n_msrs = g_list_length(msrs); + hv_register_assoc *assocs = g_new0(hv_register_assoc, n_msrs); + + entries = msrs; + for (const GList *elem = entries; elem != NULL; elem = elem->next) { + entry = elem->data; + ret = mshv_msr_to_hv_reg_name(entry->index, &name); + if (ret < 0) { + g_free(assocs); + return ret; + } + assoc = &assocs[i]; + assoc->name = name; + /* the union has been initialized to 0 */ + assoc->value.reg64 = entry->data; + i++; + } + ret = mshv_set_generic_regs(cpu, assocs, n_msrs); + g_free(assocs); + if (ret < 0) { + error_report("failed to set msrs"); + return -1; + } + return 0; +} + + +int mshv_configure_msr(const CPUState *cpu, const MshvMsrEntry *msrs, + size_t n_msrs) +{ + GList *valid_msrs = NULL; + uint32_t msr_index; + int ret; + + for (size_t i = 0; i < n_msrs; i++) { + msr_index = msrs[i].index; + /* check whether index of msrs is in SUPPORTED_MSRS */ + if (mshv_is_supported_msr(msr_index)) { + valid_msrs = g_list_append(valid_msrs, (void *) &msrs[i]); + } + } + + ret = set_msrs(cpu, valid_msrs); + g_list_free(valid_msrs); + + return ret; +} diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index 0ea8d504fa..6649438313 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -14,6 +14,8 @@ #ifndef QEMU_MSHV_INT_H #define QEMU_MSHV_INT_H +#define MSHV_MSR_ENTRIES_COUNT 64 + typedef struct hyperv_message hv_message; struct AccelCPUState { @@ -102,6 +104,21 @@ typedef struct MshvMemoryRegion { void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, bool add); +/* msr */ +typedef struct MshvMsrEntry { + uint32_t index; + uint32_t reserved; + uint64_t data; +} MshvMsrEntry; + +typedef struct MshvMsrEntries { + MshvMsrEntry entries[MSHV_MSR_ENTRIES_COUNT]; + uint32_t nmsrs; +} MshvMsrEntries; + +int mshv_configure_msr(const CPUState *cpu, const MshvMsrEntry *msrs, + size_t n_msrs); + /* interrupt */ void mshv_init_msicontrol(void); int mshv_reserve_ioapic_msi_routes(int vm_fd); diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 3aec8fd41c..8b7c173838 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -435,9 +435,11 @@ typedef enum X86Seg { #define MSR_SMI_COUNT 0x34 #define MSR_CORE_THREAD_COUNT 0x35 #define MSR_MTRRcap 0xfe +#define MSR_MTRR_MEM_TYPE_WB 0x06 #define MSR_MTRRcap_VCNT 8 #define MSR_MTRRcap_FIXRANGE_SUPPORT (1 << 8) #define MSR_MTRRcap_WC_SUPPORTED (1 << 10) +#define MSR_MTRR_ENABLE (1 << 11) #define MSR_IA32_SYSENTER_CS 0x174 #define MSR_IA32_SYSENTER_ESP 0x175 diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 2b7a81274b..1f43dfc58a 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -872,6 +872,33 @@ static int set_lint(int cpu_fd) return set_lapic(cpu_fd, &lapic_state); } +static int setup_msrs(const CPUState *cpu) +{ + int ret; + uint64_t default_type = MSR_MTRR_ENABLE | MSR_MTRR_MEM_TYPE_WB; + + /* boot msr entries */ + MshvMsrEntry msrs[9] = { + { .index = IA32_MSR_SYSENTER_CS, .data = 0x0, }, + { .index = IA32_MSR_SYSENTER_ESP, .data = 0x0, }, + { .index = IA32_MSR_SYSENTER_EIP, .data = 0x0, }, + { .index = IA32_MSR_STAR, .data = 0x0, }, + { .index = IA32_MSR_CSTAR, .data = 0x0, }, + { .index = IA32_MSR_LSTAR, .data = 0x0, }, + { .index = IA32_MSR_KERNEL_GS_BASE, .data = 0x0, }, + { .index = IA32_MSR_SFMASK, .data = 0x0, }, + { .index = IA32_MSR_MTRR_DEF_TYPE, .data = default_type, }, + }; + + ret = mshv_configure_msr(cpu, msrs, 9); + if (ret < 0) { + error_report("failed to setup msrs"); + return -1; + } + + return 0; +} + /* * TODO: populate topology info: * @@ -891,6 +918,12 @@ int mshv_configure_vcpu(const CPUState *cpu, const struct MshvFPU *fpu, return -1; } + ret = setup_msrs(cpu); + if (ret < 0) { + error_report("failed to setup msrs"); + return -1; + } + ret = set_cpu_state(cpu, fpu, xcr0); if (ret < 0) { error_report("failed to set cpu state"); From 9bc6a1d29605a13c541629a651e41787af65a963 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:40 +0200 Subject: [PATCH 26/35] target/i386/mshv: Integrate x86 instruction decoder/emulator Connect the x86 instruction decoder and emulator to the MSHV backend to handle intercepted instructions. This enables software emulation of MMIO operations in MSHV guests. MSHV has a translate_gva hypercall that is used to accessing the physical guest memory. A guest might read from unmapped memory regions (e.g. OVMF will probe 0xfed40000 for a vTPM). In those cases 0xFF bytes is returned instead of aborting the execution. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-21-magnuskulke@linux.microsoft.com [mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- accel/mshv/mem.c | 65 +++++++++++++++++ include/system/mshv_int.h | 4 ++ target/i386/mshv/mshv-cpu.c | 135 ++++++++++++++++++++++++++++++++++++ 3 files changed, 204 insertions(+) diff --git a/accel/mshv/mem.c b/accel/mshv/mem.c index a0a40eb333..e55c38d4db 100644 --- a/accel/mshv/mem.c +++ b/accel/mshv/mem.c @@ -59,6 +59,71 @@ static int map_or_unmap(int vm_fd, const MshvMemoryRegion *mr, bool map) return set_guest_memory(vm_fd, ®ion); } +static int handle_unmapped_mmio_region_read(uint64_t gpa, uint64_t size, + uint8_t *data) +{ + warn_report("read from unmapped mmio region gpa=0x%lx size=%lu", gpa, size); + + if (size == 0 || size > 8) { + error_report("invalid size %lu for reading from unmapped mmio region", + size); + return -1; + } + + memset(data, 0xFF, size); + + return 0; +} + +int mshv_guest_mem_read(uint64_t gpa, uint8_t *data, uintptr_t size, + bool is_secure_mode, bool instruction_fetch) +{ + int ret; + MemTxAttrs memattr = { .secure = is_secure_mode }; + + if (instruction_fetch) { + trace_mshv_insn_fetch(gpa, size); + } else { + trace_mshv_mem_read(gpa, size); + } + + ret = address_space_rw(&address_space_memory, gpa, memattr, (void *)data, + size, false); + if (ret == MEMTX_OK) { + return 0; + } + + if (ret == MEMTX_DECODE_ERROR) { + return handle_unmapped_mmio_region_read(gpa, size, data); + } + + error_report("failed to read guest memory at 0x%lx", gpa); + return -1; +} + +int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size, + bool is_secure_mode) +{ + int ret; + MemTxAttrs memattr = { .secure = is_secure_mode }; + + trace_mshv_mem_write(gpa, size); + ret = address_space_rw(&address_space_memory, gpa, memattr, (void *)data, + size, true); + if (ret == MEMTX_OK) { + return 0; + } + + if (ret == MEMTX_DECODE_ERROR) { + warn_report("write to unmapped mmio region gpa=0x%lx size=%lu", gpa, + size); + return 0; + } + + error_report("Failed to write guest memory"); + return -1; +} + static int set_memory(const MshvMemoryRegion *mshv_mr, bool add) { int ret = 0; diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index 6649438313..b29d39911d 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -101,6 +101,10 @@ typedef struct MshvMemoryRegion { bool readonly; } MshvMemoryRegion; +int mshv_guest_mem_read(uint64_t gpa, uint8_t *data, uintptr_t size, + bool is_secure_mode, bool instruction_fetch); +int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size, + bool is_secure_mode); void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, bool add); diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 1f43dfc58a..424ebdb122 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -104,6 +104,47 @@ static enum hv_register_name FPU_REGISTER_NAMES[26] = { HV_X64_REGISTER_XMM_CONTROL_STATUS, }; +static int translate_gva(const CPUState *cpu, uint64_t gva, uint64_t *gpa, + uint64_t flags) +{ + int ret; + int cpu_fd = mshv_vcpufd(cpu); + int vp_index = cpu->cpu_index; + + hv_input_translate_virtual_address in = { 0 }; + hv_output_translate_virtual_address out = { 0 }; + struct mshv_root_hvcall args = {0}; + uint64_t gva_page = gva >> HV_HYP_PAGE_SHIFT; + + in.vp_index = vp_index; + in.control_flags = flags; + in.gva_page = gva_page; + + /* create the hvcall envelope */ + args.code = HVCALL_TRANSLATE_VIRTUAL_ADDRESS; + args.in_sz = sizeof(in); + args.in_ptr = (uint64_t) ∈ + args.out_sz = sizeof(out); + args.out_ptr = (uint64_t) &out; + + /* perform the call */ + ret = mshv_hvcall(cpu_fd, &args); + if (ret < 0) { + error_report("Failed to invoke gva->gpa translation"); + return -errno; + } + + if (out.translation_result.result_code != HV_TRANSLATE_GVA_SUCCESS) { + error_report("Failed to translate gva (" TARGET_FMT_lx ") to gpa", gva); + return -1; + } + + *gpa = ((out.gpa_page << HV_HYP_PAGE_SHIFT) + | (gva & ~(uint64_t)HV_HYP_PAGE_MASK)); + + return 0; +} + int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, size_t n_regs) { @@ -1006,8 +1047,102 @@ int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd) return 0; } +static int guest_mem_read_with_gva(const CPUState *cpu, uint64_t gva, + uint8_t *data, uintptr_t size, + bool fetch_instruction) +{ + int ret; + uint64_t gpa, flags; + + flags = HV_TRANSLATE_GVA_VALIDATE_READ; + ret = translate_gva(cpu, gva, &gpa, flags); + if (ret < 0) { + error_report("failed to translate gva to gpa"); + return -1; + } + + ret = mshv_guest_mem_read(gpa, data, size, false, fetch_instruction); + if (ret < 0) { + error_report("failed to read from guest memory"); + return -1; + } + + return 0; +} + +static int guest_mem_write_with_gva(const CPUState *cpu, uint64_t gva, + const uint8_t *data, uintptr_t size) +{ + int ret; + uint64_t gpa, flags; + + flags = HV_TRANSLATE_GVA_VALIDATE_WRITE; + ret = translate_gva(cpu, gva, &gpa, flags); + if (ret < 0) { + error_report("failed to translate gva to gpa"); + return -1; + } + ret = mshv_guest_mem_write(gpa, data, size, false); + if (ret < 0) { + error_report("failed to write to guest memory"); + return -1; + } + return 0; +} + +static void write_mem(CPUState *cpu, void *data, target_ulong addr, int bytes) +{ + if (guest_mem_write_with_gva(cpu, addr, data, bytes) < 0) { + error_report("failed to write memory"); + abort(); + } +} + +static void fetch_instruction(CPUState *cpu, void *data, + target_ulong addr, int bytes) +{ + if (guest_mem_read_with_gva(cpu, addr, data, bytes, true) < 0) { + error_report("failed to fetch instruction"); + abort(); + } +} + +static void read_mem(CPUState *cpu, void *data, target_ulong addr, int bytes) +{ + if (guest_mem_read_with_gva(cpu, addr, data, bytes, false) < 0) { + error_report("failed to read memory"); + abort(); + } +} + +static void read_segment_descriptor(CPUState *cpu, + struct x86_segment_descriptor *desc, + enum X86Seg seg_idx) +{ + bool ret; + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + SegmentCache *seg = &env->segs[seg_idx]; + x86_segment_selector sel = { .sel = seg->selector & 0xFFFF }; + + ret = x86_read_segment_descriptor(cpu, desc, sel); + if (ret == false) { + error_report("failed to read segment descriptor"); + abort(); + } +} + +static const struct x86_emul_ops mshv_x86_emul_ops = { + .fetch_instruction = fetch_instruction, + .read_mem = read_mem, + .write_mem = write_mem, + .read_segment_descriptor = read_segment_descriptor, +}; + void mshv_init_mmio_emu(void) { + init_decoder(); + init_emu(&mshv_x86_emul_ops); } void mshv_arch_init_vcpu(CPUState *cpu) From 64118f452cbd97cd9fa790b1c15b65b435f136d2 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:41 +0200 Subject: [PATCH 27/35] target/i386/mshv: Write MSRs to the hypervisor Push current model-specific register (MSR) values to MSHV's vCPUs as part of setting state to the hypervisor. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-22-magnuskulke@linux.microsoft.com Signed-off-by: Paolo Bonzini --- target/i386/mshv/mshv-cpu.c | 68 +++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 424ebdb122..33a3ce8b11 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -998,6 +998,65 @@ static int put_regs(const CPUState *cpu) return 0; } +struct MsrPair { + uint32_t index; + uint64_t value; +}; + +static int put_msrs(const CPUState *cpu) +{ + int ret = 0; + X86CPU *x86cpu = X86_CPU(cpu); + CPUX86State *env = &x86cpu->env; + MshvMsrEntries *msrs = g_malloc0(sizeof(MshvMsrEntries)); + + struct MsrPair pairs[] = { + { MSR_IA32_SYSENTER_CS, env->sysenter_cs }, + { MSR_IA32_SYSENTER_ESP, env->sysenter_esp }, + { MSR_IA32_SYSENTER_EIP, env->sysenter_eip }, + { MSR_EFER, env->efer }, + { MSR_PAT, env->pat }, + { MSR_STAR, env->star }, + { MSR_CSTAR, env->cstar }, + { MSR_LSTAR, env->lstar }, + { MSR_KERNELGSBASE, env->kernelgsbase }, + { MSR_FMASK, env->fmask }, + { MSR_MTRRdefType, env->mtrr_deftype }, + { MSR_VM_HSAVE_PA, env->vm_hsave }, + { MSR_SMI_COUNT, env->msr_smi_count }, + { MSR_IA32_PKRS, env->pkrs }, + { MSR_IA32_BNDCFGS, env->msr_bndcfgs }, + { MSR_IA32_XSS, env->xss }, + { MSR_IA32_UMWAIT_CONTROL, env->umwait }, + { MSR_IA32_TSX_CTRL, env->tsx_ctrl }, + { MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr }, + { MSR_TSC_AUX, env->tsc_aux }, + { MSR_TSC_ADJUST, env->tsc_adjust }, + { MSR_IA32_SMBASE, env->smbase }, + { MSR_IA32_SPEC_CTRL, env->spec_ctrl }, + { MSR_VIRT_SSBD, env->virt_ssbd }, + }; + + if (ARRAY_SIZE(pairs) > MSHV_MSR_ENTRIES_COUNT) { + error_report("MSR entries exceed maximum size"); + g_free(msrs); + return -1; + } + + for (size_t i = 0; i < ARRAY_SIZE(pairs); i++) { + MshvMsrEntry *entry = &msrs->entries[i]; + entry->index = pairs[i].index; + entry->reserved = 0; + entry->data = pairs[i].value; + msrs->nmsrs++; + } + + ret = mshv_configure_msr(cpu, &msrs->entries[0], msrs->nmsrs); + g_free(msrs); + return ret; +} + + int mshv_arch_put_registers(const CPUState *cpu) { int ret; @@ -1008,8 +1067,13 @@ int mshv_arch_put_registers(const CPUState *cpu) return -1; } - error_report("unimplemented"); - abort(); + ret = put_msrs(cpu); + if (ret < 0) { + error_report("Failed to put msrs"); + return -1; + } + + return 0; } void mshv_arch_amend_proc_features( From 6dec60528c419781f1f79cd9837d5f26415a3fd7 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:42 +0200 Subject: [PATCH 28/35] target/i386/mshv: Implement mshv_vcpu_run() Add the main vCPU execution loop for MSHV using the MSHV_RUN_VP ioctl. The execution loop handles guest entry and VM exits. There are handlers for memory r/w, PIO and MMIO to which the exit events are dispatched. In case of MMIO the i386 instruction decoder/emulator is invoked to perform the operation in user space. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-23-magnuskulke@linux.microsoft.com Signed-off-by: Paolo Bonzini --- target/i386/mshv/mshv-cpu.c | 444 +++++++++++++++++++++++++++++++++++- 1 file changed, 442 insertions(+), 2 deletions(-) diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 33a3ce8b11..7edc032cea 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -1082,10 +1082,450 @@ void mshv_arch_amend_proc_features( features->access_guest_idle_reg = 1; } +static int set_memory_info(const struct hyperv_message *msg, + struct hv_x64_memory_intercept_message *info) +{ + if (msg->header.message_type != HVMSG_GPA_INTERCEPT + && msg->header.message_type != HVMSG_UNMAPPED_GPA + && msg->header.message_type != HVMSG_UNACCEPTED_GPA) { + error_report("invalid message type"); + return -1; + } + memcpy(info, msg->payload, sizeof(*info)); + + return 0; +} + +static int emulate_instruction(CPUState *cpu, + const uint8_t *insn_bytes, size_t insn_len, + uint64_t gva, uint64_t gpa) +{ + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + struct x86_decode decode = { 0 }; + int ret; + x86_insn_stream stream = { .bytes = insn_bytes, .len = insn_len }; + + ret = mshv_load_regs(cpu); + if (ret < 0) { + error_report("failed to load registers"); + return -1; + } + + decode_instruction_stream(env, &decode, &stream); + exec_instruction(env, &decode); + + ret = mshv_store_regs(cpu); + if (ret < 0) { + error_report("failed to store registers"); + return -1; + } + + return 0; +} + +static int handle_mmio(CPUState *cpu, const struct hyperv_message *msg, + MshvVmExit *exit_reason) +{ + struct hv_x64_memory_intercept_message info = { 0 }; + size_t insn_len; + uint8_t access_type; + uint8_t *instruction_bytes; + int ret; + + ret = set_memory_info(msg, &info); + if (ret < 0) { + error_report("failed to convert message to memory info"); + return -1; + } + insn_len = info.instruction_byte_count; + access_type = info.header.intercept_access_type; + + if (access_type == HV_X64_INTERCEPT_ACCESS_TYPE_EXECUTE) { + error_report("invalid intercept access type: execute"); + return -1; + } + + if (insn_len > 16) { + error_report("invalid mmio instruction length: %zu", insn_len); + return -1; + } + + trace_mshv_handle_mmio(info.guest_virtual_address, + info.guest_physical_address, + info.instruction_byte_count, access_type); + + instruction_bytes = info.instruction_bytes; + + ret = emulate_instruction(cpu, instruction_bytes, insn_len, + info.guest_virtual_address, + info.guest_physical_address); + if (ret < 0) { + error_report("failed to emulate mmio"); + return -1; + } + + *exit_reason = MshvVmExitIgnore; + + return 0; +} + +static int set_ioport_info(const struct hyperv_message *msg, + hv_x64_io_port_intercept_message *info) +{ + if (msg->header.message_type != HVMSG_X64_IO_PORT_INTERCEPT) { + error_report("Invalid message type"); + return -1; + } + memcpy(info, msg->payload, sizeof(*info)); + + return 0; +} + +static int set_x64_registers(const CPUState *cpu, const uint32_t *names, + const uint64_t *values) +{ + + hv_register_assoc assocs[2]; + int ret; + + for (size_t i = 0; i < ARRAY_SIZE(assocs); i++) { + assocs[i].name = names[i]; + assocs[i].value.reg64 = values[i]; + } + + ret = mshv_set_generic_regs(cpu, assocs, ARRAY_SIZE(assocs)); + if (ret < 0) { + error_report("failed to set x64 registers"); + return -1; + } + + return 0; +} + +static inline MemTxAttrs get_mem_attrs(bool is_secure_mode) +{ + MemTxAttrs memattr = {0}; + memattr.secure = is_secure_mode; + return memattr; +} + +static void pio_read(uint64_t port, uint8_t *data, uintptr_t size, + bool is_secure_mode) +{ + int ret = 0; + MemTxAttrs memattr = get_mem_attrs(is_secure_mode); + ret = address_space_rw(&address_space_io, port, memattr, (void *)data, size, + false); + if (ret != MEMTX_OK) { + error_report("Failed to read from port %lx: %d", port, ret); + abort(); + } +} + +static int pio_write(uint64_t port, const uint8_t *data, uintptr_t size, + bool is_secure_mode) +{ + int ret = 0; + MemTxAttrs memattr = get_mem_attrs(is_secure_mode); + ret = address_space_rw(&address_space_io, port, memattr, (void *)data, size, + true); + return ret; +} + +static int handle_pio_non_str(const CPUState *cpu, + hv_x64_io_port_intercept_message *info) +{ + size_t len = info->access_info.access_size; + uint8_t access_type = info->header.intercept_access_type; + int ret; + uint32_t val, eax; + const uint32_t eax_mask = 0xffffffffu >> (32 - len * 8); + size_t insn_len; + uint64_t rip, rax; + uint32_t reg_names[2]; + uint64_t reg_values[2]; + uint16_t port = info->port_number; + + if (access_type == HV_X64_INTERCEPT_ACCESS_TYPE_WRITE) { + union { + uint32_t u32; + uint8_t bytes[4]; + } conv; + + /* convert the first 4 bytes of rax to bytes */ + conv.u32 = (uint32_t)info->rax; + /* secure mode is set to false */ + ret = pio_write(port, conv.bytes, len, false); + if (ret < 0) { + error_report("Failed to write to io port"); + return -1; + } + } else { + uint8_t data[4] = { 0 }; + /* secure mode is set to false */ + pio_read(info->port_number, data, len, false); + + /* Preserve high bits in EAX, but clear out high bits in RAX */ + val = *(uint32_t *)data; + eax = (((uint32_t)info->rax) & ~eax_mask) | (val & eax_mask); + info->rax = (uint64_t)eax; + } + + insn_len = info->header.instruction_length; + + /* Advance RIP and update RAX */ + rip = info->header.rip + insn_len; + rax = info->rax; + + reg_names[0] = HV_X64_REGISTER_RIP; + reg_values[0] = rip; + reg_names[1] = HV_X64_REGISTER_RAX; + reg_values[1] = rax; + + ret = set_x64_registers(cpu, reg_names, reg_values); + if (ret < 0) { + error_report("Failed to set x64 registers"); + return -1; + } + + cpu->accel->dirty = false; + + return 0; +} + +static int fetch_guest_state(CPUState *cpu) +{ + int ret; + + ret = mshv_get_standard_regs(cpu); + if (ret < 0) { + error_report("Failed to get standard registers"); + return -1; + } + + ret = mshv_get_special_regs(cpu); + if (ret < 0) { + error_report("Failed to get special registers"); + return -1; + } + + return 0; +} + +static int read_memory(const CPUState *cpu, uint64_t initial_gva, + uint64_t initial_gpa, uint64_t gva, uint8_t *data, + size_t len) +{ + int ret; + uint64_t gpa, flags; + + if (gva == initial_gva) { + gpa = initial_gpa; + } else { + flags = HV_TRANSLATE_GVA_VALIDATE_READ; + ret = translate_gva(cpu, gva, &gpa, flags); + if (ret < 0) { + return -1; + } + + ret = mshv_guest_mem_read(gpa, data, len, false, false); + if (ret < 0) { + error_report("failed to read guest mem"); + return -1; + } + } + + return 0; +} + +static int write_memory(const CPUState *cpu, uint64_t initial_gva, + uint64_t initial_gpa, uint64_t gva, const uint8_t *data, + size_t len) +{ + int ret; + uint64_t gpa, flags; + + if (gva == initial_gva) { + gpa = initial_gpa; + } else { + flags = HV_TRANSLATE_GVA_VALIDATE_WRITE; + ret = translate_gva(cpu, gva, &gpa, flags); + if (ret < 0) { + error_report("failed to translate gva to gpa"); + return -1; + } + } + ret = mshv_guest_mem_write(gpa, data, len, false); + if (ret != MEMTX_OK) { + error_report("failed to write to mmio"); + return -1; + } + + return 0; +} + +static int handle_pio_str_write(CPUState *cpu, + hv_x64_io_port_intercept_message *info, + size_t repeat, uint16_t port, + bool direction_flag) +{ + int ret; + uint64_t src; + uint8_t data[4] = { 0 }; + size_t len = info->access_info.access_size; + + src = linear_addr(cpu, info->rsi, R_DS); + + for (size_t i = 0; i < repeat; i++) { + ret = read_memory(cpu, 0, 0, src, data, len); + if (ret < 0) { + error_report("Failed to read memory"); + return -1; + } + ret = pio_write(port, data, len, false); + if (ret < 0) { + error_report("Failed to write to io port"); + return -1; + } + src += direction_flag ? -len : len; + info->rsi += direction_flag ? -len : len; + } + + return 0; +} + +static int handle_pio_str_read(CPUState *cpu, + hv_x64_io_port_intercept_message *info, + size_t repeat, uint16_t port, + bool direction_flag) +{ + int ret; + uint64_t dst; + size_t len = info->access_info.access_size; + uint8_t data[4] = { 0 }; + + dst = linear_addr(cpu, info->rdi, R_ES); + + for (size_t i = 0; i < repeat; i++) { + pio_read(port, data, len, false); + + ret = write_memory(cpu, 0, 0, dst, data, len); + if (ret < 0) { + error_report("Failed to write memory"); + return -1; + } + dst += direction_flag ? -len : len; + info->rdi += direction_flag ? -len : len; + } + + return 0; +} + +static int handle_pio_str(CPUState *cpu, hv_x64_io_port_intercept_message *info) +{ + uint8_t access_type = info->header.intercept_access_type; + uint16_t port = info->port_number; + bool repop = info->access_info.rep_prefix == 1; + size_t repeat = repop ? info->rcx : 1; + size_t insn_len = info->header.instruction_length; + bool direction_flag; + uint32_t reg_names[3]; + uint64_t reg_values[3]; + int ret; + X86CPU *x86_cpu = X86_CPU(cpu); + CPUX86State *env = &x86_cpu->env; + + ret = fetch_guest_state(cpu); + if (ret < 0) { + error_report("Failed to fetch guest state"); + return -1; + } + + direction_flag = (env->eflags & DESC_E_MASK) != 0; + + if (access_type == HV_X64_INTERCEPT_ACCESS_TYPE_WRITE) { + ret = handle_pio_str_write(cpu, info, repeat, port, direction_flag); + if (ret < 0) { + error_report("Failed to handle pio str write"); + return -1; + } + reg_names[0] = HV_X64_REGISTER_RSI; + reg_values[0] = info->rsi; + } else { + ret = handle_pio_str_read(cpu, info, repeat, port, direction_flag); + reg_names[0] = HV_X64_REGISTER_RDI; + reg_values[0] = info->rdi; + } + + reg_names[1] = HV_X64_REGISTER_RIP; + reg_values[1] = info->header.rip + insn_len; + reg_names[2] = HV_X64_REGISTER_RAX; + reg_values[2] = info->rax; + + ret = set_x64_registers(cpu, reg_names, reg_values); + if (ret < 0) { + error_report("Failed to set x64 registers"); + return -1; + } + + cpu->accel->dirty = false; + + return 0; +} + +static int handle_pio(CPUState *cpu, const struct hyperv_message *msg) +{ + struct hv_x64_io_port_intercept_message info = { 0 }; + int ret; + + ret = set_ioport_info(msg, &info); + if (ret < 0) { + error_report("Failed to convert message to ioport info"); + return -1; + } + + if (info.access_info.string_op) { + return handle_pio_str(cpu, &info); + } + + return handle_pio_non_str(cpu, &info); +} + int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit) { - error_report("unimplemented"); - abort(); + int ret; + enum MshvVmExit exit_reason; + int cpu_fd = mshv_vcpufd(cpu); + + ret = ioctl(cpu_fd, MSHV_RUN_VP, msg); + if (ret < 0) { + return MshvVmExitShutdown; + } + + switch (msg->header.message_type) { + case HVMSG_UNRECOVERABLE_EXCEPTION: + return MshvVmExitShutdown; + case HVMSG_UNMAPPED_GPA: + case HVMSG_GPA_INTERCEPT: + ret = handle_mmio(cpu, msg, &exit_reason); + if (ret < 0) { + error_report("failed to handle mmio"); + return -1; + } + return exit_reason; + case HVMSG_X64_IO_PORT_INTERCEPT: + ret = handle_pio(cpu, msg); + if (ret < 0) { + return MshvVmExitSpecial; + } + return MshvVmExitIgnore; + default: + break; + } + + *exit = MshvVmExitIgnore; + return 0; } void mshv_remove_vcpu(int vm_fd, int cpu_fd) From efc4093358511a58846a409b965213aa1bb9f31a Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:43 +0200 Subject: [PATCH 29/35] accel/mshv: Handle overlapping mem mappings QEMU maps certain regions into the guest multiple times, as seen in the trace below. Currently the MSHV kernel driver will reject those mappings. To workaround this, a record is kept (a static global list of "slots", inspired by what the HVF accelerator has implemented). An overlapping region is not registered at the hypervisor, and marked as mapped=false. If there is an UNMAPPED_GPA exit, we can look for a slot that is unmapped and would cover the GPA. In this case we map out the conflicting slot and map in the requested region. mshv_set_phys_mem add=1 name=pc.bios mshv_map_memory => u_a=7ffff4e00000 gpa=00fffc0000 size=00040000 mshv_set_phys_mem add=1 name=ioapic mshv_set_phys_mem add=1 name=hpet mshv_set_phys_mem add=0 name=pc.ram mshv_unmap_memory u_a=7fff67e00000 gpa=0000000000 size=80000000 mshv_set_phys_mem add=1 name=pc.ram mshv_map_memory u_a=7fff67e00000 gpa=0000000000 size=000c0000 mshv_set_phys_mem add=1 name=pc.rom mshv_map_memory u_a=7ffff4c00000 gpa=00000c0000 size=00020000 mshv_set_phys_mem add=1 name=pc.bios mshv_remap_attempt => u_a=7ffff4e20000 gpa=00000e0000 size=00020000 The mapping table is guarded by a mutex for concurrent modification and RCU mechanisms for concurrent reads. Writes occur rarely, but we'll have to verify whether an unmapped region exist for each UNMAPPED_GPA exit, which happens frequently. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-24-magnuskulke@linux.microsoft.com [Fix format strings for trace-events; mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- accel/mshv/mem.c | 406 +++++++++++++++++++++++++++++++++--- accel/mshv/mshv-all.c | 2 + accel/mshv/trace-events | 5 + include/system/mshv_int.h | 22 +- target/i386/mshv/mshv-cpu.c | 43 ++++ 5 files changed, 448 insertions(+), 30 deletions(-) diff --git a/accel/mshv/mem.c b/accel/mshv/mem.c index e55c38d4db..0e2164af3e 100644 --- a/accel/mshv/mem.c +++ b/accel/mshv/mem.c @@ -11,7 +11,9 @@ */ #include "qemu/osdep.h" +#include "qemu/lockable.h" #include "qemu/error-report.h" +#include "qemu/rcu.h" #include "linux/mshv.h" #include "system/address-spaces.h" #include "system/mshv.h" @@ -20,6 +22,137 @@ #include #include "trace.h" +typedef struct SlotsRCUReclaim { + struct rcu_head rcu; + GList *old_head; + MshvMemorySlot *removed_slot; +} SlotsRCUReclaim; + +static void rcu_reclaim_slotlist(struct rcu_head *rcu) +{ + SlotsRCUReclaim *r = container_of(rcu, SlotsRCUReclaim, rcu); + g_list_free(r->old_head); + g_free(r->removed_slot); + g_free(r); +} + +static void publish_slots(GList *new_head, GList *old_head, + MshvMemorySlot *removed_slot) +{ + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + qatomic_store_release(&manager->slots, new_head); + + SlotsRCUReclaim *r = g_new(SlotsRCUReclaim, 1); + r->old_head = old_head; + r->removed_slot = removed_slot; + + call_rcu1(&r->rcu, rcu_reclaim_slotlist); +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static int remove_slot(MshvMemorySlot *slot) +{ + GList *old_head, *new_head; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + old_head = qatomic_load_acquire(&manager->slots); + + if (!g_list_find(old_head, slot)) { + error_report("slot requested for removal not found"); + return -1; + } + + new_head = g_list_copy(old_head); + new_head = g_list_remove(new_head, slot); + manager->n_slots--; + + publish_slots(new_head, old_head, slot); + + return 0; +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static MshvMemorySlot *append_slot(uint64_t gpa, uint64_t userspace_addr, + uint64_t size, bool readonly) +{ + GList *old_head, *new_head; + MshvMemorySlot *slot; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + + old_head = qatomic_load_acquire(&manager->slots); + + if (manager->n_slots >= MSHV_MAX_MEM_SLOTS) { + error_report("no free memory slots available"); + return NULL; + } + + slot = g_new0(MshvMemorySlot, 1); + slot->guest_phys_addr = gpa; + slot->userspace_addr = userspace_addr; + slot->memory_size = size; + slot->readonly = readonly; + + new_head = g_list_copy(old_head); + new_head = g_list_append(new_head, slot); + manager->n_slots++; + + publish_slots(new_head, old_head, NULL); + + return slot; +} + +static int slot_overlaps(const MshvMemorySlot *slot1, + const MshvMemorySlot *slot2) +{ + uint64_t start_1 = slot1->userspace_addr, + start_2 = slot2->userspace_addr; + size_t len_1 = slot1->memory_size, + len_2 = slot2->memory_size; + + if (slot1 == slot2) { + return -1; + } + + return ranges_overlap(start_1, len_1, start_2, len_2) ? 0 : -1; +} + +static bool is_mapped(MshvMemorySlot *slot) +{ + /* Subsequent reads of mapped field see a fully-initialized slot */ + return qatomic_load_acquire(&slot->mapped); +} + +/* + * Find slot that is: + * - overlapping in userspace + * - currently mapped in the guest + * + * Needs to be called with mshv_state->msm.mutex or RCU read lock held. + */ +static MshvMemorySlot *find_overlap_mem_slot(GList *head, MshvMemorySlot *slot) +{ + GList *found; + MshvMemorySlot *overlap_slot; + + found = g_list_find_custom(head, slot, (GCompareFunc) slot_overlaps); + + if (!found) { + return NULL; + } + + overlap_slot = found->data; + if (!overlap_slot || !is_mapped(overlap_slot)) { + return NULL; + } + + return overlap_slot; +} + static int set_guest_memory(int vm_fd, const struct mshv_user_mem_region *region) { @@ -27,38 +160,169 @@ static int set_guest_memory(int vm_fd, ret = ioctl(vm_fd, MSHV_SET_GUEST_MEMORY, region); if (ret < 0) { - error_report("failed to set guest memory"); - return -errno; + error_report("failed to set guest memory: %s", strerror(errno)); + return -1; } return 0; } -static int map_or_unmap(int vm_fd, const MshvMemoryRegion *mr, bool map) +static int map_or_unmap(int vm_fd, const MshvMemorySlot *slot, bool map) { struct mshv_user_mem_region region = {0}; - region.guest_pfn = mr->guest_phys_addr >> MSHV_PAGE_SHIFT; - region.size = mr->memory_size; - region.userspace_addr = mr->userspace_addr; + region.guest_pfn = slot->guest_phys_addr >> MSHV_PAGE_SHIFT; + region.size = slot->memory_size; + region.userspace_addr = slot->userspace_addr; if (!map) { region.flags |= (1 << MSHV_SET_MEM_BIT_UNMAP); - trace_mshv_unmap_memory(mr->userspace_addr, mr->guest_phys_addr, - mr->memory_size); + trace_mshv_unmap_memory(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); return set_guest_memory(vm_fd, ®ion); } region.flags = BIT(MSHV_SET_MEM_BIT_EXECUTABLE); - if (!mr->readonly) { + if (!slot->readonly) { region.flags |= BIT(MSHV_SET_MEM_BIT_WRITABLE); } - trace_mshv_map_memory(mr->userspace_addr, mr->guest_phys_addr, - mr->memory_size); + trace_mshv_map_memory(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); return set_guest_memory(vm_fd, ®ion); } +static int slot_matches_region(const MshvMemorySlot *slot1, + const MshvMemorySlot *slot2) +{ + return (slot1->guest_phys_addr == slot2->guest_phys_addr && + slot1->userspace_addr == slot2->userspace_addr && + slot1->memory_size == slot2->memory_size) ? 0 : -1; +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static MshvMemorySlot *find_mem_slot_by_region(uint64_t gpa, uint64_t size, + uint64_t userspace_addr) +{ + MshvMemorySlot ref_slot = { + .guest_phys_addr = gpa, + .userspace_addr = userspace_addr, + .memory_size = size, + }; + GList *found; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + found = g_list_find_custom(manager->slots, &ref_slot, + (GCompareFunc) slot_matches_region); + + return found ? found->data : NULL; +} + +static int slot_covers_gpa(const MshvMemorySlot *slot, uint64_t *gpa_p) +{ + uint64_t gpa_offset, gpa = *gpa_p; + + gpa_offset = gpa - slot->guest_phys_addr; + return (slot->guest_phys_addr <= gpa && gpa_offset < slot->memory_size) + ? 0 : -1; +} + +/* Needs to be called with mshv_state->msm.mutex or RCU read lock held */ +static MshvMemorySlot *find_mem_slot_by_gpa(GList *head, uint64_t gpa) +{ + GList *found; + MshvMemorySlot *slot; + + trace_mshv_find_slot_by_gpa(gpa); + + found = g_list_find_custom(head, &gpa, (GCompareFunc) slot_covers_gpa); + if (found) { + slot = found->data; + trace_mshv_found_slot(slot->userspace_addr, slot->guest_phys_addr, + slot->memory_size); + return slot; + } + + return NULL; +} + +/* Needs to be called with mshv_state->msm.mutex held */ +static void set_mapped(MshvMemorySlot *slot, bool mapped) +{ + /* prior writes to mapped field becomes visible before readers see slot */ + qatomic_store_release(&slot->mapped, mapped); +} + +MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa) +{ + MshvMemorySlot *gpa_slot, *overlap_slot; + GList *head; + int ret; + MshvMemorySlotManager *manager = &mshv_state->msm; + + /* fast path, called often by unmapped_gpa vm exit */ + WITH_RCU_READ_LOCK_GUARD() { + assert(manager); + head = qatomic_load_acquire(&manager->slots); + /* return early if no slot is found */ + gpa_slot = find_mem_slot_by_gpa(head, gpa); + if (gpa_slot == NULL) { + return MshvRemapNoMapping; + } + + /* return early if no overlapping slot is found */ + overlap_slot = find_overlap_mem_slot(head, gpa_slot); + if (overlap_slot == NULL) { + return MshvRemapNoOverlap; + } + } + + /* + * We'll modify the mapping list, so we need to upgrade to mutex and + * recheck. + */ + assert(manager); + QEMU_LOCK_GUARD(&manager->mutex); + + /* return early if no slot is found */ + gpa_slot = find_mem_slot_by_gpa(manager->slots, gpa); + if (gpa_slot == NULL) { + return MshvRemapNoMapping; + } + + /* return early if no overlapping slot is found */ + overlap_slot = find_overlap_mem_slot(manager->slots, gpa_slot); + if (overlap_slot == NULL) { + return MshvRemapNoOverlap; + } + + /* unmap overlapping slot */ + ret = map_or_unmap(vm_fd, overlap_slot, false); + if (ret < 0) { + error_report("failed to unmap overlap region"); + abort(); + } + set_mapped(overlap_slot, false); + warn_report("mapped out userspace_addr=0x%016lx gpa=0x%010lx size=0x%lx", + overlap_slot->userspace_addr, + overlap_slot->guest_phys_addr, + overlap_slot->memory_size); + + /* map region for gpa */ + ret = map_or_unmap(vm_fd, gpa_slot, true); + if (ret < 0) { + error_report("failed to map new region"); + abort(); + } + set_mapped(gpa_slot, true); + warn_report("mapped in userspace_addr=0x%016lx gpa=0x%010lx size=0x%lx", + gpa_slot->userspace_addr, gpa_slot->guest_phys_addr, + gpa_slot->memory_size); + + return MshvRemapOk; +} + static int handle_unmapped_mmio_region_read(uint64_t gpa, uint64_t size, uint8_t *data) { @@ -124,20 +388,97 @@ int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size, return -1; } -static int set_memory(const MshvMemoryRegion *mshv_mr, bool add) +static int tracked_unmap(int vm_fd, uint64_t gpa, uint64_t size, + uint64_t userspace_addr) { - int ret = 0; + int ret; + MshvMemorySlot *slot; + MshvMemorySlotManager *manager = &mshv_state->msm; - if (!mshv_mr) { - error_report("Invalid mshv_mr"); + assert(manager); + + QEMU_LOCK_GUARD(&manager->mutex); + + slot = find_mem_slot_by_region(gpa, size, userspace_addr); + if (!slot) { + trace_mshv_skip_unset_mem(userspace_addr, gpa, size); + /* no work to do */ + return 0; + } + + if (!is_mapped(slot)) { + /* remove slot, no need to unmap */ + return remove_slot(slot); + } + + ret = map_or_unmap(vm_fd, slot, false); + if (ret < 0) { + error_report("failed to unmap memory region"); + return ret; + } + return remove_slot(slot); +} + +static int tracked_map(int vm_fd, uint64_t gpa, uint64_t size, bool readonly, + uint64_t userspace_addr) +{ + MshvMemorySlot *slot, *overlap_slot; + int ret; + MshvMemorySlotManager *manager = &mshv_state->msm; + + assert(manager); + + QEMU_LOCK_GUARD(&manager->mutex); + + slot = find_mem_slot_by_region(gpa, size, userspace_addr); + if (slot) { + error_report("memory region already mapped at gpa=0x%lx, " + "userspace_addr=0x%lx, size=0x%lx", + slot->guest_phys_addr, slot->userspace_addr, + slot->memory_size); return -1; } - trace_mshv_set_memory(add, mshv_mr->guest_phys_addr, - mshv_mr->memory_size, - mshv_mr->userspace_addr, mshv_mr->readonly, - ret); - return map_or_unmap(mshv_state->vm, mshv_mr, add); + slot = append_slot(gpa, userspace_addr, size, readonly); + + overlap_slot = find_overlap_mem_slot(manager->slots, slot); + if (overlap_slot) { + trace_mshv_remap_attempt(slot->userspace_addr, + slot->guest_phys_addr, + slot->memory_size); + warn_report("attempt to map region [0x%lx-0x%lx], while " + "[0x%lx-0x%lx] is already mapped in the guest", + userspace_addr, userspace_addr + size - 1, + overlap_slot->userspace_addr, + overlap_slot->userspace_addr + + overlap_slot->memory_size - 1); + + /* do not register mem slot in hv, but record for later swap-in */ + set_mapped(slot, false); + + return 0; + } + + ret = map_or_unmap(vm_fd, slot, true); + if (ret < 0) { + error_report("failed to map memory region"); + return -1; + } + set_mapped(slot, true); + + return 0; +} + +static int set_memory(uint64_t gpa, uint64_t size, bool readonly, + uint64_t userspace_addr, bool add) +{ + int vm_fd = mshv_state->vm; + + if (add) { + return tracked_map(vm_fd, gpa, size, readonly, userspace_addr); + } + + return tracked_unmap(vm_fd, gpa, size, userspace_addr); } /* @@ -173,7 +514,9 @@ void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, bool writable = !area->readonly && !area->rom_device; hwaddr start_addr, mr_offset, size; void *ram; - MshvMemoryRegion mshv_mr = {0}; + + size = align_section(section, &start_addr); + trace_mshv_set_phys_mem(add, section->mr->name, start_addr); size = align_section(section, &start_addr); trace_mshv_set_phys_mem(add, section->mr->name, start_addr); @@ -200,14 +543,21 @@ void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, ram = memory_region_get_ram_ptr(area) + mr_offset; - mshv_mr.guest_phys_addr = start_addr; - mshv_mr.memory_size = size; - mshv_mr.readonly = !writable; - mshv_mr.userspace_addr = (uint64_t)ram; - - ret = set_memory(&mshv_mr, add); + ret = set_memory(start_addr, size, !writable, (uint64_t)ram, add); if (ret < 0) { - error_report("Failed to set memory region"); + error_report("failed to set memory region"); abort(); } } + +void mshv_init_memory_slot_manager(MshvState *mshv_state) +{ + MshvMemorySlotManager *manager; + + assert(mshv_state); + manager = &mshv_state->msm; + + manager->n_slots = 0; + manager->slots = NULL; + qemu_mutex_init(&manager->mutex); +} diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c index fa1f8f35bd..5edfcbad9d 100644 --- a/accel/mshv/mshv-all.c +++ b/accel/mshv/mshv-all.c @@ -437,6 +437,8 @@ static int mshv_init(AccelState *as, MachineState *ms) mshv_init_msicontrol(); + mshv_init_memory_slot_manager(s); + ret = create_vm(mshv_fd, &vm_fd); if (ret < 0) { close(mshv_fd); diff --git a/accel/mshv/trace-events b/accel/mshv/trace-events index a4dffeb24a..36f0d59b38 100644 --- a/accel/mshv/trace-events +++ b/accel/mshv/trace-events @@ -26,3 +26,8 @@ mshv_map_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x% mshv_unmap_memory(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 mshv_set_phys_mem(bool add, const char *name, uint64_t gpa) "\tadd=%d name=%s gpa=0x%010" PRIx64 mshv_handle_mmio(uint64_t gva, uint64_t gpa, uint64_t size, uint8_t access_type) "\tgva=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%" PRIx64 " access_type=%d" + +mshv_found_slot(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_skip_unset_mem(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_remap_attempt(uint64_t userspace_addr, uint64_t gpa, uint64_t size) "\tu_a=0x%" PRIx64 " gpa=0x%010" PRIx64 " size=0x%08" PRIx64 +mshv_find_slot_by_gpa(uint64_t gpa) "\tgpa=0x%010" PRIx64 diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index b29d39911d..6350c69e9d 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -16,6 +16,8 @@ #define MSHV_MSR_ENTRIES_COUNT 64 +#define MSHV_MAX_MEM_SLOTS 32 + typedef struct hyperv_message hv_message; struct AccelCPUState { @@ -33,6 +35,12 @@ typedef struct MshvAddressSpace { AddressSpace *as; } MshvAddressSpace; +typedef struct MshvMemorySlotManager { + size_t n_slots; + GList *slots; + QemuMutex mutex; +} MshvMemorySlotManager; + struct MshvState { AccelState parent_obj; int vm; @@ -41,6 +49,7 @@ struct MshvState { int nr_as; MshvAddressSpace *as; int fd; + MshvMemorySlotManager msm; }; typedef struct MshvMsiControl { @@ -71,6 +80,12 @@ typedef enum MshvVmExit { MshvVmExitSpecial = 2, } MshvVmExit; +typedef enum MshvRemapResult { + MshvRemapOk = 0, + MshvRemapNoMapping = 1, + MshvRemapNoOverlap = 2, +} MshvRemapResult; + void mshv_init_mmio_emu(void); int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd); void mshv_remove_vcpu(int vm_fd, int cpu_fd); @@ -94,19 +109,22 @@ int mshv_hvcall(int fd, const struct mshv_root_hvcall *args); #endif /* memory */ -typedef struct MshvMemoryRegion { +typedef struct MshvMemorySlot { uint64_t guest_phys_addr; uint64_t memory_size; uint64_t userspace_addr; bool readonly; -} MshvMemoryRegion; + bool mapped; +} MshvMemorySlot; +MshvRemapResult mshv_remap_overlap_region(int vm_fd, uint64_t gpa); int mshv_guest_mem_read(uint64_t gpa, uint8_t *data, uintptr_t size, bool is_secure_mode, bool instruction_fetch); int mshv_guest_mem_write(uint64_t gpa, const uint8_t *data, uintptr_t size, bool is_secure_mode); void mshv_set_phys_mem(MshvMemoryListener *mml, MemoryRegionSection *section, bool add); +void mshv_init_memory_slot_manager(MshvState *mshv_state); /* msr */ typedef struct MshvMsrEntry { diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index 7edc032cea..de87142bff 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -1170,6 +1170,43 @@ static int handle_mmio(CPUState *cpu, const struct hyperv_message *msg, return 0; } +static int handle_unmapped_mem(int vm_fd, CPUState *cpu, + const struct hyperv_message *msg, + MshvVmExit *exit_reason) +{ + struct hv_x64_memory_intercept_message info = { 0 }; + uint64_t gpa; + int ret; + enum MshvRemapResult remap_result; + + ret = set_memory_info(msg, &info); + if (ret < 0) { + error_report("failed to convert message to memory info"); + return -1; + } + + gpa = info.guest_physical_address; + + /* attempt to remap the region, in case of overlapping userspace mappings */ + remap_result = mshv_remap_overlap_region(vm_fd, gpa); + *exit_reason = MshvVmExitIgnore; + + switch (remap_result) { + case MshvRemapNoMapping: + /* if we didn't find a mapping, it is probably mmio */ + return handle_mmio(cpu, msg, exit_reason); + case MshvRemapOk: + break; + case MshvRemapNoOverlap: + /* This should not happen, but we are forgiving it */ + warn_report("found no overlap for unmapped region"); + *exit_reason = MshvVmExitSpecial; + break; + } + + return 0; +} + static int set_ioport_info(const struct hyperv_message *msg, hv_x64_io_port_intercept_message *info) { @@ -1507,6 +1544,12 @@ int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit) case HVMSG_UNRECOVERABLE_EXCEPTION: return MshvVmExitShutdown; case HVMSG_UNMAPPED_GPA: + ret = handle_unmapped_mem(vm_fd, cpu, msg, &exit_reason); + if (ret < 0) { + error_report("failed to handle unmapped memory"); + return -1; + } + return exit_reason; case HVMSG_GPA_INTERCEPT: ret = handle_mmio(cpu, msg, &exit_reason); if (ret < 0) { From e7b08dfb902430b4f8226d23d7cf9b2762b6fc83 Mon Sep 17 00:00:00 2001 From: Praveen K Paladugu Date: Tue, 16 Sep 2025 18:48:44 +0200 Subject: [PATCH 30/35] qapi/accel: Allow to query mshv capabilities Allow to query mshv capabilities via query-mshv QMP and info mshv HMP commands. Signed-off-by: Magnus Kulke Acked-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20250916164847.77883-25-magnuskulke@linux.microsoft.com [Fix "since" version. - Paolo] Signed-off-by: Paolo Bonzini --- hmp-commands-info.hx | 13 +++++++++++++ hw/core/machine-hmp-cmds.c | 15 +++++++++++++++ hw/core/machine-qmp-cmds.c | 14 ++++++++++++++ include/monitor/hmp.h | 1 + include/system/hw_accel.h | 1 + qapi/accelerator.json | 29 +++++++++++++++++++++++++++++ 6 files changed, 73 insertions(+) diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx index 6142f60e7b..eaaa880c1b 100644 --- a/hmp-commands-info.hx +++ b/hmp-commands-info.hx @@ -307,6 +307,19 @@ SRST Show KVM information. ERST + { + .name = "mshv", + .args_type = "", + .params = "", + .help = "show MSHV information", + .cmd = hmp_info_mshv, + }, + +SRST + ``info mshv`` + Show MSHV information. +ERST + { .name = "numa", .args_type = "", diff --git a/hw/core/machine-hmp-cmds.c b/hw/core/machine-hmp-cmds.c index 3a612e2232..682ed9f49b 100644 --- a/hw/core/machine-hmp-cmds.c +++ b/hw/core/machine-hmp-cmds.c @@ -163,6 +163,21 @@ void hmp_info_kvm(Monitor *mon, const QDict *qdict) qapi_free_KvmInfo(info); } +void hmp_info_mshv(Monitor *mon, const QDict *qdict) +{ + MshvInfo *info; + + info = qmp_query_mshv(NULL); + monitor_printf(mon, "mshv support: "); + if (info->present) { + monitor_printf(mon, "%s\n", info->enabled ? "enabled" : "disabled"); + } else { + monitor_printf(mon, "not compiled\n"); + } + + qapi_free_MshvInfo(info); +} + void hmp_info_uuid(Monitor *mon, const QDict *qdict) { UuidInfo *info; diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c index 6aca1a626e..e24bf0d97b 100644 --- a/hw/core/machine-qmp-cmds.c +++ b/hw/core/machine-qmp-cmds.c @@ -28,6 +28,20 @@ #include "system/runstate.h" #include "system/system.h" #include "hw/s390x/storage-keys.h" +#include + +/* + * QMP query for MSHV + */ +MshvInfo *qmp_query_mshv(Error **errp) +{ + MshvInfo *info = g_malloc0(sizeof(*info)); + + info->enabled = mshv_enabled(); + info->present = accel_find("mshv"); + + return info; +} /* * fast means: we NEVER interrupt vCPU threads to retrieve diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h index ae116d9804..31bd812e5f 100644 --- a/include/monitor/hmp.h +++ b/include/monitor/hmp.h @@ -24,6 +24,7 @@ strList *hmp_split_at_comma(const char *str); void hmp_info_name(Monitor *mon, const QDict *qdict); void hmp_info_version(Monitor *mon, const QDict *qdict); void hmp_info_kvm(Monitor *mon, const QDict *qdict); +void hmp_info_mshv(Monitor *mon, const QDict *qdict); void hmp_info_status(Monitor *mon, const QDict *qdict); void hmp_info_uuid(Monitor *mon, const QDict *qdict); void hmp_info_chardev(Monitor *mon, const QDict *qdict); diff --git a/include/system/hw_accel.h b/include/system/hw_accel.h index fa9228d5d2..55497edc29 100644 --- a/include/system/hw_accel.h +++ b/include/system/hw_accel.h @@ -14,6 +14,7 @@ #include "hw/core/cpu.h" #include "system/kvm.h" #include "system/hvf.h" +#include "system/mshv.h" #include "system/whpx.h" #include "system/nvmm.h" diff --git a/qapi/accelerator.json b/qapi/accelerator.json index fb28c8d920..664e027246 100644 --- a/qapi/accelerator.json +++ b/qapi/accelerator.json @@ -54,3 +54,32 @@ { 'command': 'x-accel-stats', 'returns': 'HumanReadableText', 'features': [ 'unstable' ] } + +## +# @MshvInfo: +# +# Information about support for MSHV acceleration +# +# @enabled: true if MSHV acceleration is active +# +# @present: true if MSHV acceleration is built into this executable +# +# Since: 10.2.0 +## +{ 'struct': 'MshvInfo', 'data': {'enabled': 'bool', 'present': 'bool'} } + +## +# @query-mshv: +# +# Return information about MSHV acceleration +# +# Returns: @MshvInfo +# +# Since: 10.0.92 +# +# .. qmp-example:: +# +# -> { "execute": "query-mshv" } +# <- { "return": { "enabled": true, "present": true } } +## +{ 'command': 'query-mshv', 'returns': 'MshvInfo' } From e4a20afce59937073298d716cc829bdc026542dc Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Thu, 2 Oct 2025 09:50:12 +0200 Subject: [PATCH 31/35] target/i386/mshv: Use preallocated page for hvcall There are hvcalls that are invoked during MMIO exits, the payload is of dynamic size. To avoid heap allocations we can use preallocated pages as in/out buffer for those calls. A page is reserved per vCPU and used for set/get register hv calls. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-26-magnuskulke@linux.microsoft.com [Use standard MAX_CONST macro; mshv.h/mshv_int.h split. - Paolo] Signed-off-by: Paolo Bonzini --- accel/mshv/mshv-all.c | 2 +- include/system/mshv_int.h | 7 +++++++ target/i386/mshv/mshv-cpu.c | 38 +++++++++++++++++++++++++------------ 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c index 5edfcbad9d..45174f7c4e 100644 --- a/accel/mshv/mshv-all.c +++ b/accel/mshv/mshv-all.c @@ -399,8 +399,8 @@ static int mshv_init_vcpu(CPUState *cpu) uint8_t vp_index = cpu->cpu_index; int ret; - mshv_arch_init_vcpu(cpu); cpu->accel = g_new0(AccelCPUState, 1); + mshv_arch_init_vcpu(cpu); ret = mshv_create_vcpu(vm_fd, vp_index, &cpu->accel->cpufd); if (ret < 0) { diff --git a/include/system/mshv_int.h b/include/system/mshv_int.h index 6350c69e9d..490563c1ab 100644 --- a/include/system/mshv_int.h +++ b/include/system/mshv_int.h @@ -20,9 +20,16 @@ typedef struct hyperv_message hv_message; +typedef struct MshvHvCallArgs { + void *base; + void *input_page; + void *output_page; +} MshvHvCallArgs; + struct AccelCPUState { int cpufd; bool dirty; + MshvHvCallArgs hvcall_args; }; typedef struct MshvMemoryListener { diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c index de87142bff..1f7b9cb37e 100644 --- a/target/i386/mshv/mshv-cpu.c +++ b/target/i386/mshv/mshv-cpu.c @@ -34,6 +34,10 @@ #include +#define MAX_REGISTER_COUNT (MAX_CONST(ARRAY_SIZE(STANDARD_REGISTER_NAMES), \ + MAX_CONST(ARRAY_SIZE(SPECIAL_REGISTER_NAMES), \ + ARRAY_SIZE(FPU_REGISTER_NAMES)))) + static enum hv_register_name STANDARD_REGISTER_NAMES[18] = { HV_X64_REGISTER_RAX, HV_X64_REGISTER_RBX, @@ -151,7 +155,7 @@ int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, int cpu_fd = mshv_vcpufd(cpu); int vp_index = cpu->cpu_index; size_t in_sz, assocs_sz; - hv_input_set_vp_registers *in; + hv_input_set_vp_registers *in = cpu->accel->hvcall_args.input_page; struct mshv_root_hvcall args = {0}; int ret; @@ -160,7 +164,7 @@ int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, in_sz = sizeof(hv_input_set_vp_registers) + assocs_sz; /* fill the input struct */ - in = g_malloc0(in_sz); + memset(in, 0, sizeof(hv_input_set_vp_registers)); in->vp_index = vp_index; memcpy(in->elements, assocs, assocs_sz); @@ -172,7 +176,6 @@ int mshv_set_generic_regs(const CPUState *cpu, const hv_register_assoc *assocs, /* perform the call */ ret = mshv_hvcall(cpu_fd, &args); - g_free(in); if (ret < 0) { error_report("Failed to set registers"); return -1; @@ -193,8 +196,8 @@ static int get_generic_regs(CPUState *cpu, hv_register_assoc *assocs, { int cpu_fd = mshv_vcpufd(cpu); int vp_index = cpu->cpu_index; - hv_input_get_vp_registers *in; - hv_register_value *values; + hv_input_get_vp_registers *in = cpu->accel->hvcall_args.input_page; + hv_register_value *values = cpu->accel->hvcall_args.output_page; size_t in_sz, names_sz, values_sz; int i, ret; struct mshv_root_hvcall args = {0}; @@ -204,15 +207,14 @@ static int get_generic_regs(CPUState *cpu, hv_register_assoc *assocs, in_sz = sizeof(hv_input_get_vp_registers) + names_sz; /* fill the input struct */ - in = g_malloc0(in_sz); + memset(in, 0, sizeof(hv_input_get_vp_registers)); in->vp_index = vp_index; for (i = 0; i < n_regs; i++) { in->names[i] = assocs[i].name; } - /* allocate value output buffer */ + /* determine size of value output buffer */ values_sz = n_regs * sizeof(union hv_register_value); - values = g_malloc0(values_sz); /* create the hvcall envelope */ args.code = HVCALL_GET_VP_REGISTERS; @@ -224,16 +226,13 @@ static int get_generic_regs(CPUState *cpu, hv_register_assoc *assocs, /* perform the call */ ret = mshv_hvcall(cpu_fd, &args); - g_free(in); if (ret < 0) { - g_free(values); error_report("Failed to retrieve registers"); return -1; } /* assert we got all registers */ if (args.reps != n_regs) { - g_free(values); error_report("Failed to retrieve registers: expected %zu elements" ", got %u", n_regs, args.reps); return -1; @@ -243,7 +242,6 @@ static int get_generic_regs(CPUState *cpu, hv_register_assoc *assocs, for (i = 0; i < n_regs; i++) { assocs[i].value = values[i]; } - g_free(values); return 0; } @@ -1696,6 +1694,19 @@ void mshv_arch_init_vcpu(CPUState *cpu) { X86CPU *x86_cpu = X86_CPU(cpu); CPUX86State *env = &x86_cpu->env; + AccelCPUState *state = cpu->accel; + size_t page = HV_HYP_PAGE_SIZE; + void *mem = qemu_memalign(page, 2 * page); + + /* sanity check, to make sure we don't overflow the page */ + QEMU_BUILD_BUG_ON((MAX_REGISTER_COUNT + * sizeof(hv_register_assoc) + + sizeof(hv_input_get_vp_registers) + > HV_HYP_PAGE_SIZE)); + + state->hvcall_args.base = mem; + state->hvcall_args.input_page = mem; + state->hvcall_args.output_page = (uint8_t *)mem + page; env->emu_mmio_buf = g_new(char, 4096); } @@ -1704,7 +1715,10 @@ void mshv_arch_destroy_vcpu(CPUState *cpu) { X86CPU *x86_cpu = X86_CPU(cpu); CPUX86State *env = &x86_cpu->env; + AccelCPUState *state = cpu->accel; + g_free(state->hvcall_args.base); + state->hvcall_args = (MshvHvCallArgs){0}; g_clear_pointer(&env->emu_mmio_buf, g_free); } From 3af71a1a6a7a497df7d3026239d6136b56e3d5ab Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:46 +0200 Subject: [PATCH 32/35] docs: Add mshv to documentation Added mshv to the list of accelerators in doc text. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-27-magnuskulke@linux.microsoft.com Signed-off-by: Paolo Bonzini --- docs/about/build-platforms.rst | 2 +- docs/devel/codebase.rst | 2 +- docs/glossary.rst | 7 +++---- docs/system/introduction.rst | 3 +++ qemu-options.hx | 16 ++++++++-------- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/docs/about/build-platforms.rst b/docs/about/build-platforms.rst index 798cb4631d..fc2743658d 100644 --- a/docs/about/build-platforms.rst +++ b/docs/about/build-platforms.rst @@ -53,7 +53,7 @@ Those hosts are officially supported, with various accelerators: * - SPARC - tcg * - x86 - - hvf (64 bit only), kvm, nvmm, tcg, whpx (64 bit only), xen + - hvf (64 bit only), mshv (64 bit only), kvm, nvmm, tcg, whpx (64 bit only), xen Other host architectures are not supported. It is possible to build QEMU system emulation on an unsupported host architecture using the configure diff --git a/docs/devel/codebase.rst b/docs/devel/codebase.rst index 2a3143787a..69d8827117 100644 --- a/docs/devel/codebase.rst +++ b/docs/devel/codebase.rst @@ -48,7 +48,7 @@ yet, so sometimes the source code is all you have. * `accel `_: Infrastructure and architecture agnostic code related to the various `accelerators ` supported by QEMU - (TCG, KVM, hvf, whpx, xen, nvmm). + (TCG, KVM, hvf, whpx, xen, nvmm, mshv). Contains interfaces for operations that will be implemented per `target `_. * `audio `_: diff --git a/docs/glossary.rst b/docs/glossary.rst index 4fa044bfb6..2857731bc4 100644 --- a/docs/glossary.rst +++ b/docs/glossary.rst @@ -12,7 +12,7 @@ Accelerator A specific API used to accelerate execution of guest instructions. It can be hardware-based, through a virtualization API provided by the host OS (kvm, hvf, -whpx, ...), or software-based (tcg). See this description of `supported +whpx, mshv, ...), or software-based (tcg). See this description of `supported accelerators`. Board @@ -101,9 +101,8 @@ manage a virtual machine. QEMU is a virtualizer, that interacts with various hypervisors. In the context of QEMU, an hypervisor is an API, provided by the Host OS, -allowing to execute virtual machines. Linux implementation is KVM (and supports -Xen as well). For MacOS, it's HVF. Windows defines WHPX. And NetBSD provides -NVMM. +allowing to execute virtual machines. Linux provides a choice of KVM, Xen +or MSHV; MacOS provides HVF; Windows provides WHPX; NetBSD provides NVMM. .. _machine: diff --git a/docs/system/introduction.rst b/docs/system/introduction.rst index 4cd46b5b8f..9c57523b6c 100644 --- a/docs/system/introduction.rst +++ b/docs/system/introduction.rst @@ -23,6 +23,9 @@ Tiny Code Generator (TCG) capable of emulating many CPUs. * - Xen - Linux (as dom0) - Arm, x86 + * - MSHV + - Linux (as dom0) + - x86 * - Hypervisor Framework (hvf) - MacOS - x86 (64 bit only), Arm (64 bit only) diff --git a/qemu-options.hx b/qemu-options.hx index 075f4be2e3..56db4bf9e5 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -28,7 +28,7 @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \ "-machine [type=]name[,prop[=value][,...]]\n" " selects emulated machine ('-machine help' for list)\n" " property accel=accel1[:accel2[:...]] selects accelerator\n" - " supported accelerators are kvm, xen, hvf, nvmm, whpx or tcg (default: tcg)\n" + " supported accelerators are kvm, xen, hvf, nvmm, whpx, mshv or tcg (default: tcg)\n" " vmport=on|off|auto controls emulation of vmport (default: auto)\n" " dump-guest-core=on|off include guest memory in a core dump (default=on)\n" " mem-merge=on|off controls memory merge support (default: on)\n" @@ -66,10 +66,10 @@ SRST ``accel=accels1[:accels2[:...]]`` This is used to enable an accelerator. Depending on the target - architecture, kvm, xen, hvf, nvmm, whpx or tcg can be available. - By default, tcg is used. If there is more than one accelerator - specified, the next one is used if the previous one fails to - initialize. + architecture, kvm, xen, hvf, nvmm, whpx, mshv or tcg can be + available. By default, tcg is used. If there is more than one + accelerator specified, the next one is used if the previous one + fails to initialize. ``vmport=on|off|auto`` Enables emulation of VMWare IO port, for vmmouse etc. auto says @@ -226,7 +226,7 @@ ERST DEF("accel", HAS_ARG, QEMU_OPTION_accel, "-accel [accel=]accelerator[,prop[=value][,...]]\n" - " select accelerator (kvm, xen, hvf, nvmm, whpx or tcg; use 'help' for a list)\n" + " select accelerator (kvm, xen, hvf, nvmm, whpx, mshv or tcg; use 'help' for a list)\n" " igd-passthru=on|off (enable Xen integrated Intel graphics passthrough, default=off)\n" " kernel-irqchip=on|off|split controls accelerated irqchip support (default=on)\n" " kvm-shadow-mem=size of KVM shadow MMU in bytes\n" @@ -241,8 +241,8 @@ DEF("accel", HAS_ARG, QEMU_OPTION_accel, SRST ``-accel name[,prop=value[,...]]`` This is used to enable an accelerator. Depending on the target - architecture, kvm, xen, hvf, nvmm, whpx or tcg can be available. By - default, tcg is used. If there is more than one accelerator + architecture, kvm, xen, hvf, nvmm, whpx, mshv or tcg can be available. + By default, tcg is used. If there is more than one accelerator specified, the next one is used if the previous one fails to initialize. From 1872bd9f2dad5113b0c27d352ec49683e0953c7f Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Tue, 16 Sep 2025 18:48:47 +0200 Subject: [PATCH 33/35] MAINTAINERS: Add maintainers for mshv accelerator Adding Magnus Kulke and Wei Liu to the maintainers file for the respective folders/files. Signed-off-by: Magnus Kulke Link: https://lore.kernel.org/r/20250916164847.77883-28-magnuskulke@linux.microsoft.com [Rename "MAHV CPUs" to mention x86. - Paolo] Signed-off-by: Paolo Bonzini --- MAINTAINERS | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 406cef88f0..645344c4fa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -551,6 +551,21 @@ F: target/i386/whpx/ F: accel/stubs/whpx-stub.c F: include/system/whpx.h +MSHV +M: Magnus Kulke +R: Wei Liu +S: Supported +F: accel/mshv/ +F: include/system/mshv.h +F: include/hw/hyperv/hvgdk*.h +F: include/hw/hyperv/hvhdk*.h + +X86 MSHV CPUs +M: Magnus Kulke +R: Wei Liu +S: Supported +F: target/i386/mshv/ + X86 Instruction Emulator M: Cameron Esfahani M: Roman Bolshakov From 1a583ca382f63be537977e76bf54a0670e57a2a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 7 Oct 2025 19:34:05 +0400 Subject: [PATCH 34/35] tests/docker: make --enable-rust overridable with EXTRA_CONFIGURE_OPTS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marc-André Lureau Link: https://lore.kernel.org/r/20251007153406.421032-1-marcandre.lureau@redhat.com Signed-off-by: Paolo Bonzini --- tests/docker/common.rc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/docker/common.rc b/tests/docker/common.rc index 752f4f3aed..79d533ab2e 100755 --- a/tests/docker/common.rc +++ b/tests/docker/common.rc @@ -53,8 +53,8 @@ configure_qemu() config_opts="--enable-werror \ ${TARGET_LIST:+--target-list=${TARGET_LIST}} \ --prefix=$INSTALL_DIR \ - $QEMU_CONFIGURE_OPTS $EXTRA_CONFIGURE_OPTS \ $enable_rust \ + $QEMU_CONFIGURE_OPTS $EXTRA_CONFIGURE_OPTS \ $@" echo "Configure options:" echo $config_opts From b9ef6198d709f431d893d1b5525cdf7fd7a3e11b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 7 Oct 2025 15:44:27 -0400 Subject: [PATCH 35/35] rust: fix path to rust_root_crate.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generated Rust root crate source files contain the wrong path to the rust_root_crate.sh script. Signed-off-by: Stefan Hajnoczi Reviewed-by: Manos Pitsidianakis Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20251007194427.118871-1-stefanha@redhat.com Signed-off-by: Paolo Bonzini --- scripts/rust/rust_root_crate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/rust/rust_root_crate.sh b/scripts/rust/rust_root_crate.sh index 975bddf7f1..f05b8d0210 100755 --- a/scripts/rust/rust_root_crate.sh +++ b/scripts/rust/rust_root_crate.sh @@ -4,7 +4,7 @@ set -eu cat <