From 112b55f0b012d00a50befc8e1aa4abe350f00b01 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 7 Oct 2025 15:42:13 -0300 Subject: [PATCH 01/36] migration/savevm: Add a compatibility check for capabilities It has always been possible to enable arbitrary migration capabilities and attempt to take a snapshot of the VM with the savevm/loadvm commands as well as their QMP counterparts snapshot-save/snapshot-load. Most migration capabilities are not meant to be used with snapshots and there's a risk of crashing QEMU or producing incorrect behavior. Ideally, every migration capability would either be implemented for savevm or explicitly rejected. Add a compatibility check routine and reject the snapshot command if an incompatible capability is enabled. For now only act on the the two that actually cause a crash: multifd and mapped-ram. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2881 Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20251007184213.5990-1-farosas@suse.de Signed-off-by: Peter Xu --- migration/options.c | 27 +++++++++++++++++++++++++++ migration/options.h | 1 + migration/savevm.c | 8 ++++++++ 3 files changed, 36 insertions(+) diff --git a/migration/options.c b/migration/options.c index 5183112775..d9227809d7 100644 --- a/migration/options.c +++ b/migration/options.c @@ -445,11 +445,38 @@ INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, MIGRATION_CAPABILITY_VALIDATE_UUID, MIGRATION_CAPABILITY_ZERO_COPY_SEND); +/* Snapshot compatibility check list */ +static const +INITIALIZE_MIGRATE_CAPS_SET(check_caps_savevm, + MIGRATION_CAPABILITY_MULTIFD, + MIGRATION_CAPABILITY_MAPPED_RAM, +); + static bool migrate_incoming_started(void) { return !!migration_incoming_get_current()->transport_data; } +bool migrate_can_snapshot(Error **errp) +{ + MigrationState *s = migrate_get_current(); + int i; + + for (i = 0; i < check_caps_savevm.size; i++) { + int incomp_cap = check_caps_savevm.caps[i]; + + if (s->capabilities[incomp_cap]) { + 
error_setg(errp, + "Snapshots are not compatible with %s", + MigrationCapability_str(incomp_cap)); + return false; + } + } + + return true; +} + + bool migrate_rdma_caps_check(bool *caps, Error **errp) { if (caps[MIGRATION_CAPABILITY_XBZRLE]) { diff --git a/migration/options.h b/migration/options.h index 82d839709e..a7b3262d1e 100644 --- a/migration/options.h +++ b/migration/options.h @@ -59,6 +59,7 @@ bool migrate_tls(void); bool migrate_rdma_caps_check(bool *caps, Error **errp); bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp); +bool migrate_can_snapshot(Error **errp); /* parameters */ diff --git a/migration/savevm.c b/migration/savevm.c index 7b35ec4dd0..aafa40d779 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -3322,6 +3322,10 @@ bool save_snapshot(const char *name, bool overwrite, const char *vmstate, GLOBAL_STATE_CODE(); + if (!migrate_can_snapshot(errp)) { + return false; + } + if (migration_is_blocked(errp)) { return false; } @@ -3507,6 +3511,10 @@ bool load_snapshot(const char *name, const char *vmstate, int ret; MigrationIncomingState *mis = migration_incoming_get_current(); + if (!migrate_can_snapshot(errp)) { + return false; + } + if (!bdrv_all_can_snapshot(has_devices, devices, errp)) { return false; } From 1a4922a96174c2c4e07d9d4f15bde3500e1520a4 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 10 Oct 2025 05:16:40 -0700 Subject: [PATCH 02/36] MAINTAINERS: update cpr reviewers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update cpr reviewers. Some of these files overlap with migration files, but some do not. 
Signed-off-by: Steve Sistare Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/1760098600-399192-1-git-send-email-steven.sistare@oracle.com Signed-off-by: Peter Xu --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ee058e2fef..12ec2d6860 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3114,7 +3114,8 @@ T: git https://gitlab.com/jsnow/qemu.git jobs T: git https://gitlab.com/vsementsov/qemu.git block CheckPoint and Restart (CPR) -R: Steve Sistare +R: Peter Xu +R: Fabiano Rosas S: Supported F: hw/vfio/cpr* F: include/hw/vfio/vfio-cpr.h From e5423828d69e90a8fc9db90d8a0d7cf916370fe0 Mon Sep 17 00:00:00 2001 From: Marco Cavenati Date: Wed, 1 Oct 2025 18:18:22 +0200 Subject: [PATCH 03/36] migration/ram: fix docs of ram_handle_zero Remove outdated 'ch' parameter from the function documentation. Signed-off-by: Marco Cavenati Reviewed-by: Juraj Marcin Link: https://lore.kernel.org/r/20251001161823.2032399-3-Marco.Cavenati@eurecom.fr Signed-off-by: Peter Xu --- migration/ram.c | 1 - 1 file changed, 1 deletion(-) diff --git a/migration/ram.c b/migration/ram.c index 5eef2efc78..1384748193 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -3630,7 +3630,6 @@ static inline void *colo_cache_from_block_offset(RAMBlock *block, * determined to be zero, then zap it. * * @host: host address for the zero page - * @ch: what the page is filled from. We only support zero * @size: size of the zero page */ void ram_handle_zero(void *host, uint64_t size) From 04a191cb36e71f4bb44d28af2b56b9624b36b0a1 Mon Sep 17 00:00:00 2001 From: Marco Cavenati Date: Fri, 10 Oct 2025 13:59:53 +0200 Subject: [PATCH 04/36] migration: add FEATURE_SEEKABLE to QIOChannelBlock Enable the use of the mapped-ram migration feature with savevm/loadvm snapshots by adding the QIO_CHANNEL_FEATURE_SEEKABLE feature to QIOChannelBlock. 
Implement io_preadv and io_pwritev methods to provide positioned I/O capabilities that don't modify the channel's position pointer. Signed-off-by: Marco Cavenati Link: https://lore.kernel.org/r/20251010115954.1995298-2-Marco.Cavenati@eurecom.fr Signed-off-by: Peter Xu --- migration/channel-block.c | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/migration/channel-block.c b/migration/channel-block.c index 97de5a691b..9cf383d4b5 100644 --- a/migration/channel-block.c +++ b/migration/channel-block.c @@ -30,6 +30,7 @@ qio_channel_block_new(BlockDriverState *bs) QIOChannelBlock *ioc; ioc = QIO_CHANNEL_BLOCK(object_new(TYPE_QIO_CHANNEL_BLOCK)); + qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE); bdrv_ref(bs); ioc->bs = bs; @@ -96,6 +97,47 @@ qio_channel_block_writev(QIOChannel *ioc, return qiov.size; } +static ssize_t +qio_channel_block_preadv(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + off_t offset, + Error **errp) +{ + QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc); + QEMUIOVector qiov; + int ret; + + qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov); + ret = bdrv_readv_vmstate(bioc->bs, &qiov, offset); + if (ret < 0) { + error_setg_errno(errp, -ret, "bdrv_readv_vmstate failed"); + return -1; + } + + return qiov.size; +} + +static ssize_t +qio_channel_block_pwritev(QIOChannel *ioc, + const struct iovec *iov, + size_t niov, + off_t offset, + Error **errp) +{ + QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc); + QEMUIOVector qiov; + int ret; + + qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov); + ret = bdrv_writev_vmstate(bioc->bs, &qiov, offset); + if (ret < 0) { + error_setg_errno(errp, -ret, "bdrv_writev_vmstate failed"); + return -1; + } + + return qiov.size; +} static int qio_channel_block_set_blocking(QIOChannel *ioc, @@ -177,6 +219,8 @@ qio_channel_block_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_block_writev; ioc_klass->io_readv = 
qio_channel_block_readv; ioc_klass->io_set_blocking = qio_channel_block_set_blocking; + ioc_klass->io_preadv = qio_channel_block_preadv; + ioc_klass->io_pwritev = qio_channel_block_pwritev; ioc_klass->io_seek = qio_channel_block_seek; ioc_klass->io_close = qio_channel_block_close; ioc_klass->io_set_aio_fd_handler = qio_channel_block_set_aio_fd_handler; From 0ecd28582477e80f867ea776eff34315b048d928 Mon Sep 17 00:00:00 2001 From: Marco Cavenati Date: Fri, 10 Oct 2025 13:59:54 +0200 Subject: [PATCH 05/36] migration: mapped-ram: handle zero pages Make mapped-ram compatible with loadvm snapshot restoring by explicitly zeroing memory pages in this case. Skip zeroing for -incoming and -loadvm migrations to preserve performance. Signed-off-by: Marco Cavenati Link: https://lore.kernel.org/r/20251010115954.1995298-3-Marco.Cavenati@eurecom.fr Signed-off-by: Peter Xu --- migration/options.c | 1 - migration/ram.c | 59 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/migration/options.c b/migration/options.c index d9227809d7..e78324b80c 100644 --- a/migration/options.c +++ b/migration/options.c @@ -449,7 +449,6 @@ INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, static const INITIALIZE_MIGRATE_CAPS_SET(check_caps_savevm, MIGRATION_CAPABILITY_MULTIFD, - MIGRATION_CAPABILITY_MAPPED_RAM, ); static bool migrate_incoming_started(void) diff --git a/migration/ram.c b/migration/ram.c index 1384748193..29f016cb25 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -4038,12 +4038,58 @@ static size_t ram_load_multifd_pages(void *host_addr, size_t size, return size; } +/** + * handle_zero_mapped_ram: Zero out a range of RAM pages if required during + * mapped-ram load + * + * Zeroing is only performed when restoring from a snapshot (HMP loadvm). + * During incoming migration or -loadvm cli snapshot load, the function is a + * no-op and returns true as in those cases the pages are already guaranteed to + * be zeroed. 
+ * + * Returns: true on success, false on error (with @errp set). + * @from_bit_idx: Starting index relative to the map of the page (inclusive) + * @to_bit_idx: Ending index relative to the map of the page (exclusive) + */ +static bool handle_zero_mapped_ram(RAMBlock *block, unsigned long from_bit_idx, + unsigned long to_bit_idx, Error **errp) +{ + ERRP_GUARD(); + ram_addr_t offset; + size_t size; + void *host; + + /* + * Zeroing is not needed for either -loadvm (RUN_STATE_PRELAUNCH), or + * -incoming (RUN_STATE_INMIGRATE). + */ + if (!runstate_check(RUN_STATE_RESTORE_VM)) { + return true; + } + + if (from_bit_idx >= to_bit_idx) { + return true; + } + + size = TARGET_PAGE_SIZE * (to_bit_idx - from_bit_idx); + offset = from_bit_idx << TARGET_PAGE_BITS; + host = host_from_ram_block_offset(block, offset); + if (!host) { + error_setg(errp, "zero page outside of ramblock %s range", + block->idstr); + return false; + } + ram_handle_zero(host, size); + + return true; +} + static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, long num_pages, unsigned long *bitmap, Error **errp) { ERRP_GUARD(); - unsigned long set_bit_idx, clear_bit_idx; + unsigned long set_bit_idx, clear_bit_idx = 0; ram_addr_t offset; void *host; size_t read, unread, size; @@ -4052,6 +4098,12 @@ static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, set_bit_idx < num_pages; set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) { + /* Zero pages */ + if (!handle_zero_mapped_ram(block, clear_bit_idx, set_bit_idx, errp)) { + return false; + } + + /* Non-zero pages */ clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1); unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx); @@ -4083,6 +4135,11 @@ static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, } } + /* Handle trailing 0 pages */ + if (!handle_zero_mapped_ram(block, clear_bit_idx, num_pages, errp)) { + return false; + } + return true; err: From 
346479c304c712f79e7c378ae4a47ca661d1d563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Thu, 16 Oct 2025 18:03:13 +0200 Subject: [PATCH 06/36] migration: Remove unused VMSTATE_UINTTL_EQUAL[_V]() macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The last use of VMSTATE_UINTTL_EQUAL() was removed in commit 16a2497bd44 ("target-ppc: Fix CPU migration from qemu-2.6 <-> later versions"), 9 years ago; remove it. Signed-off-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20251016160313.25751-1-philmd@linaro.org Signed-off-by: Peter Xu --- include/migration/cpu.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/migration/cpu.h b/include/migration/cpu.h index 65abe3c8cc..ca7cc0479e 100644 --- a/include/migration/cpu.h +++ b/include/migration/cpu.h @@ -19,8 +19,6 @@ #define VMSTATE_UINTTL_V(_f, _s, _v) \ VMSTATE_UINT64_V(_f, _s, _v) -#define VMSTATE_UINTTL_EQUAL_V(_f, _s, _v) \ - VMSTATE_UINT64_EQUAL_V(_f, _s, _v) #define VMSTATE_UINTTL_ARRAY_V(_f, _s, _n, _v) \ VMSTATE_UINT64_ARRAY_V(_f, _s, _n, _v) #define VMSTATE_UINTTL_2DARRAY_V(_f, _s, _n1, _n2, _v) \ @@ -40,8 +38,6 @@ #define VMSTATE_UINTTL_V(_f, _s, _v) \ VMSTATE_UINT32_V(_f, _s, _v) -#define VMSTATE_UINTTL_EQUAL_V(_f, _s, _v) \ - VMSTATE_UINT32_EQUAL_V(_f, _s, _v) #define VMSTATE_UINTTL_ARRAY_V(_f, _s, _n, _v) \ VMSTATE_UINT32_ARRAY_V(_f, _s, _n, _v) #define VMSTATE_UINTTL_2DARRAY_V(_f, _s, _n1, _n2, _v) \ @@ -53,8 +49,6 @@ #define VMSTATE_UINTTL(_f, _s) \ VMSTATE_UINTTL_V(_f, _s, 0) -#define VMSTATE_UINTTL_EQUAL(_f, _s) \ - VMSTATE_UINTTL_EQUAL_V(_f, _s, 0) #define VMSTATE_UINTTL_ARRAY(_f, _s, _n) \ VMSTATE_UINTTL_ARRAY_V(_f, _s, _n, 0) #define VMSTATE_UINTTL_2DARRAY(_f, _s, _n1, _n2) \ From 6f190736bfece55593a07fe0e16195221346fc04 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 21 Oct 2025 18:04:04 -0400 Subject: [PATCH 07/36] migration: Fix error leak in postcopy_ram_listen_thread() As reported and analyzed 
by Peter: https://lore.kernel.org/r/CAFEAcA9otBWtR7rPQ0Y9aBm+7ZWJzd4VWpXrAmGr8XspPn+zpw@mail.gmail.com Fix it by freeing the error. When at it, always reset the local_err pointer in both paths. Cc: Arun Menon Resolves: Coverity CID 1641390 Fixes: 94272d9b45 ("migration: Capture error in postcopy_ram_listen_thread()") Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20251021220407.2662288-2-peterx@redhat.com Signed-off-by: Peter Xu --- migration/savevm.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/migration/savevm.c b/migration/savevm.c index aafa40d779..232cae090b 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2136,17 +2136,18 @@ static void *postcopy_ram_listen_thread(void *opaque) if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING && !migrate_postcopy_ram() && migrate_dirty_bitmaps()) { - error_report("%s: loadvm failed during postcopy: %d. All states " + error_report("%s: loadvm failed during postcopy: %d: %s. All states " "are migrated except dirty bitmaps. 
Some dirty " "bitmaps may be lost, and present migrated dirty " "bitmaps are correctly migrated and valid.", - __func__, load_res); + __func__, load_res, error_get_pretty(local_err)); + g_clear_pointer(&local_err, error_free); load_res = 0; /* prevent further exit() */ } else { error_prepend(&local_err, "loadvm failed during postcopy: %d: ", load_res); migrate_set_error(migr, local_err); - error_report_err(local_err); + g_clear_pointer(&local_err, error_report_err); migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, MIGRATION_STATUS_FAILED); } From 6a65fdee8a267436f3f18a35619c854bf255d8c1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 21 Oct 2025 18:04:05 -0400 Subject: [PATCH 08/36] migration/cpr: Fix coverity report in cpr_exec_persist_state() Per reported and analyzed by Peter: https://lore.kernel.org/r/CAFEAcA_mUQ2NeoguR5efrhw7XYGofnriWEA=+Dg+Ocvyam1wAw@mail.gmail.com mfd leak is a false positive, try to use a coverity annotation (which I didn't find manual myself, but still give it a shot). Fix the other one by capture error if setenv() failed. When at it, pass the error to the top (cpr_state_save()). Along the way, changing all retval to bool when errp is around. 
Resolves: Coverity CID 1641391 Resolves: Coverity CID 1641392 Fixes: efc6587313 ("migration: cpr-exec save and load") Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20251021220407.2662288-3-peterx@redhat.com Signed-off-by: Peter Xu --- include/migration/cpr.h | 4 ++-- migration/cpr-exec.c | 10 ++++++++-- migration/cpr.c | 15 +++++++++------ migration/migration.c | 2 +- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/include/migration/cpr.h b/include/migration/cpr.h index a412d6663c..027cb98073 100644 --- a/include/migration/cpr.h +++ b/include/migration/cpr.h @@ -41,7 +41,7 @@ MigMode cpr_get_incoming_mode(void); void cpr_set_incoming_mode(MigMode mode); bool cpr_is_incoming(void); -int cpr_state_save(MigrationChannel *channel, Error **errp); +bool cpr_state_save(MigrationChannel *channel, Error **errp); int cpr_state_load(MigrationChannel *channel, Error **errp); void cpr_state_close(void); struct QIOChannel *cpr_state_ioc(void); @@ -56,7 +56,7 @@ QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp); void cpr_exec_init(void); QEMUFile *cpr_exec_output(Error **errp); QEMUFile *cpr_exec_input(Error **errp); -void cpr_exec_persist_state(QEMUFile *f); +bool cpr_exec_persist_state(QEMUFile *f, Error **errp); bool cpr_exec_has_state(void); void cpr_exec_unpersist_state(void); void cpr_exec_unpreserve_fds(void); diff --git a/migration/cpr-exec.c b/migration/cpr-exec.c index d57714bc5d..087ca94c87 100644 --- a/migration/cpr-exec.c +++ b/migration/cpr-exec.c @@ -40,16 +40,22 @@ static QEMUFile *qemu_file_new_fd_output(int fd, const char *name) return qemu_file_new_output(ioc); } -void cpr_exec_persist_state(QEMUFile *f) +bool cpr_exec_persist_state(QEMUFile *f, Error **errp) { QIOChannelFile *fioc = QIO_CHANNEL_FILE(qemu_file_get_ioc(f)); + /* coverity[leaked_storage] - mfd intentionally kept open across exec() */ int mfd = dup(fioc->fd); char val[16]; /* Remember mfd in environment for post-exec load */ qemu_clear_cloexec(mfd); 
snprintf(val, sizeof(val), "%d", mfd); - g_setenv(CPR_EXEC_STATE_NAME, val, 1); + if (!g_setenv(CPR_EXEC_STATE_NAME, val, 1)) { + error_setg(errp, "Setting env %s = %s failed", CPR_EXEC_STATE_NAME, val); + return false; + } + + return true; } static int cpr_exec_find_state(void) diff --git a/migration/cpr.c b/migration/cpr.c index 22dbac7c72..adee2a919a 100644 --- a/migration/cpr.c +++ b/migration/cpr.c @@ -176,7 +176,7 @@ bool cpr_is_incoming(void) return incoming_mode != MIG_MODE_NONE; } -int cpr_state_save(MigrationChannel *channel, Error **errp) +bool cpr_state_save(MigrationChannel *channel, Error **errp) { int ret; QEMUFile *f; @@ -190,10 +190,10 @@ int cpr_state_save(MigrationChannel *channel, Error **errp) } else if (mode == MIG_MODE_CPR_EXEC) { f = cpr_exec_output(errp); } else { - return 0; + return true; } if (!f) { - return -1; + return false; } qemu_put_be32(f, QEMU_CPR_FILE_MAGIC); @@ -202,11 +202,14 @@ int cpr_state_save(MigrationChannel *channel, Error **errp) ret = vmstate_save_state(f, &vmstate_cpr_state, &cpr_state, 0, errp); if (ret) { qemu_fclose(f); - return ret; + return false; } if (migrate_mode() == MIG_MODE_CPR_EXEC) { - cpr_exec_persist_state(f); + if (!cpr_exec_persist_state(f, errp)) { + qemu_fclose(f); + return false; + } } /* @@ -217,7 +220,7 @@ int cpr_state_save(MigrationChannel *channel, Error **errp) qio_channel_shutdown(qemu_file_get_ioc(f), QIO_CHANNEL_SHUTDOWN_WRITE, NULL); cpr_state_file = f; - return 0; + return true; } int cpr_state_load(MigrationChannel *channel, Error **errp) diff --git a/migration/migration.c b/migration/migration.c index a63b46bbef..c8a5712993 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2301,7 +2301,7 @@ void qmp_migrate(const char *uri, bool has_channels, return; } - if (cpr_state_save(cpr_channel, &local_err)) { + if (!cpr_state_save(cpr_channel, &local_err)) { goto out; } From 89471ef237ca683c90e9e8d216746b86ba4c1013 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 21 Oct 2025 
18:04:06 -0400 Subject: [PATCH 09/36] migration/cpr: Fix UAF in cpr_exec_cb() when execvp() fails Per reported and analyzed by Peter: https://lore.kernel.org/r/CAFEAcA82ih8RVCm-u1oxiS0V2K4rV4jMzNb13pAV=e2ivmiDRA@mail.gmail.com Fix the issue by moving the error_setg_errno() earlier. When at it, clear argv variable after freed. Resolves: Coverity CID 1641397 Fixes: a3eae205c6 ("migration: cpr-exec mode") Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20251021220407.2662288-4-peterx@redhat.com Signed-off-by: Peter Xu --- migration/cpr-exec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/migration/cpr-exec.c b/migration/cpr-exec.c index 087ca94c87..d284f6e734 100644 --- a/migration/cpr-exec.c +++ b/migration/cpr-exec.c @@ -152,10 +152,10 @@ static void cpr_exec_cb(void *opaque) * exec should only fail if argv[0] is bogus, or has a permissions problem, * or the system is very short on resources. */ - g_strfreev(argv); + error_setg_errno(&err, errno, "execvp %s failed", argv[0]); + g_clear_pointer(&argv, g_strfreev); cpr_exec_unpreserve_fds(); - error_setg_errno(&err, errno, "execvp %s failed", argv[0]); error_report_err(error_copy(err)); migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); migrate_set_error(s, err); From ded3cf4aaf7af98f56822566cd96f3014fe5508c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 21 Oct 2025 18:04:07 -0400 Subject: [PATCH 10/36] migration/cpr: Avoid crashing QEMU when cpr-exec runs with no args If an user invokes cpr-exec without setting the exec args first, currently it'll crash QEMU. Avoid it, instead fail the QMP migrate command. 
Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20251021220407.2662288-5-peterx@redhat.com Signed-off-by: Peter Xu --- migration/migration.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index c8a5712993..4ed2a2e881 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -2195,6 +2195,12 @@ static bool migrate_prepare(MigrationState *s, bool resume, Error **errp) error_setg(errp, "Cannot use %s with CPR", conflict); return false; } + + if (s->parameters.mode == MIG_MODE_CPR_EXEC && + !s->parameters.cpr_exec_command) { + error_setg(errp, "Parameter 'cpr-exec-command' required for cpr-exec"); + return false; + } } if (migrate_init(s, errp)) { From 8922a758b29251d9009ec509e7f580b76509ab3d Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Thu, 23 Oct 2025 17:55:24 +0800 Subject: [PATCH 11/36] ram-block-attributes: fix interaction with hugetlb memory backends Currently, CoCo VMs can perform conversion at the base page granularity, which is the granularity that has to be tracked. In relevant setups, the target page size is assumed to be equal to the host page size, thus fixing the block size to the host page size. However, since private memory and shared memory have different backend at present, users can specify shared memory with a hugetlbfs backend while private memory with guest_memfd backend only supports 4K page size. In this scenario, ram_block->page_size is different from the host page size which will trigger an assertion when retrieving the block size. To address this, return the host page size directly to relax the restriction. This changes fixes a regression of using hugetlbfs backend for shared memory within CoCo VMs, with or without VFIO devices' presence. 
Acked-by: David Hildenbrand Tested-by: Farrah Chen Signed-off-by: Chenyi Qiang Link: https://lore.kernel.org/r/20251023095526.48365-2-chenyi.qiang@intel.com [peterx: fix subject, per david] Cc: qemu-stable Signed-off-by: Peter Xu --- system/ram-block-attributes.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/system/ram-block-attributes.c b/system/ram-block-attributes.c index 68e8a02703..a7579de5b4 100644 --- a/system/ram-block-attributes.c +++ b/system/ram-block-attributes.c @@ -22,16 +22,14 @@ OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(RamBlockAttributes, { }) static size_t -ram_block_attributes_get_block_size(const RamBlockAttributes *attr) +ram_block_attributes_get_block_size(void) { /* * Because page conversion could be manipulated in the size of at least 4K * or 4K aligned, Use the host page size as the granularity to track the * memory attribute. */ - g_assert(attr && attr->ram_block); - g_assert(attr->ram_block->page_size == qemu_real_host_page_size()); - return attr->ram_block->page_size; + return qemu_real_host_page_size(); } @@ -40,7 +38,7 @@ ram_block_attributes_rdm_is_populated(const RamDiscardManager *rdm, const MemoryRegionSection *section) { const RamBlockAttributes *attr = RAM_BLOCK_ATTRIBUTES(rdm); - const size_t block_size = ram_block_attributes_get_block_size(attr); + const size_t block_size = ram_block_attributes_get_block_size(); const uint64_t first_bit = section->offset_within_region / block_size; const uint64_t last_bit = first_bit + int128_get64(section->size) / block_size - 1; @@ -81,7 +79,7 @@ ram_block_attributes_for_each_populated_section(const RamBlockAttributes *attr, { unsigned long first_bit, last_bit; uint64_t offset, size; - const size_t block_size = ram_block_attributes_get_block_size(attr); + const size_t block_size = ram_block_attributes_get_block_size(); int ret = 0; first_bit = section->offset_within_region / block_size; @@ -122,7 +120,7 @@ 
ram_block_attributes_for_each_discarded_section(const RamBlockAttributes *attr, { unsigned long first_bit, last_bit; uint64_t offset, size; - const size_t block_size = ram_block_attributes_get_block_size(attr); + const size_t block_size = ram_block_attributes_get_block_size(); int ret = 0; first_bit = section->offset_within_region / block_size; @@ -163,7 +161,7 @@ ram_block_attributes_rdm_get_min_granularity(const RamDiscardManager *rdm, const RamBlockAttributes *attr = RAM_BLOCK_ATTRIBUTES(rdm); g_assert(mr == attr->ram_block->mr); - return ram_block_attributes_get_block_size(attr); + return ram_block_attributes_get_block_size(); } static void @@ -265,7 +263,7 @@ ram_block_attributes_is_valid_range(RamBlockAttributes *attr, uint64_t offset, g_assert(mr); uint64_t region_size = memory_region_size(mr); - const size_t block_size = ram_block_attributes_get_block_size(attr); + const size_t block_size = ram_block_attributes_get_block_size(); if (!QEMU_IS_ALIGNED(offset, block_size) || !QEMU_IS_ALIGNED(size, block_size)) { @@ -322,7 +320,7 @@ int ram_block_attributes_state_change(RamBlockAttributes *attr, uint64_t offset, uint64_t size, bool to_discard) { - const size_t block_size = ram_block_attributes_get_block_size(attr); + const size_t block_size = ram_block_attributes_get_block_size(); const unsigned long first_bit = offset / block_size; const unsigned long nbits = size / block_size; const unsigned long last_bit = first_bit + nbits - 1; From b2ceb87b1a210d91a29d525590eb164d1121b8a1 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Thu, 23 Oct 2025 17:55:25 +0800 Subject: [PATCH 12/36] ram-block-attributes: Unify the retrieval of the block size There's an existing helper function designed to obtain the block size. Modify ram_block_attribute_create() to use this function for consistency. 
Tested-by: Farrah Chen Signed-off-by: Chenyi Qiang Link: https://lore.kernel.org/r/20251023095526.48365-3-chenyi.qiang@intel.com [peterx: fix double spaces, per david] Signed-off-by: Peter Xu --- system/ram-block-attributes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/system/ram-block-attributes.c b/system/ram-block-attributes.c index a7579de5b4..fb7c5c2746 100644 --- a/system/ram-block-attributes.c +++ b/system/ram-block-attributes.c @@ -390,7 +390,7 @@ int ram_block_attributes_state_change(RamBlockAttributes *attr, RamBlockAttributes *ram_block_attributes_create(RAMBlock *ram_block) { - const int block_size = qemu_real_host_page_size(); + const int block_size = ram_block_attributes_get_block_size(); RamBlockAttributes *attr; MemoryRegion *mr = ram_block->mr; From ae00f0088ffd1e9ee6a84d79dccea1820ce873ac Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 22 Oct 2025 15:04:25 -0400 Subject: [PATCH 13/36] migration/qmp: Update "resume" flag doc in "migrate" command It wasn't obvious how the resume flag should be used when staring at the QAPI doc. Enrich it to be crystal clear. Reported-by: Markus Armbruster Reviewed-by: Markus Armbruster Link: https://lore.kernel.org/r/20251022190425.2730441-1-peterx@redhat.com [peterx: amended wordings, per markus] Signed-off-by: Peter Xu --- qapi/migration.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qapi/migration.json b/qapi/migration.json index be0f3fcc12..c7a6737cc1 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -1732,7 +1732,10 @@ # @detach: this argument exists only for compatibility reasons and is # ignored by QEMU # -# @resume: resume one paused migration, default "off". (since 3.0) +# @resume: when set, use the new uri/channels specified to resume paused +# postcopy migration. This flag should only be used if the previous +# postcopy migration was interrupted. The command will fail unless +# migration is in "postcopy-paused" state. 
(default: false, since 3.0) # # Features: # From c2a06e8f28a14cfb26cc442269176ae60d1178ef Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 23 Oct 2025 12:16:57 -0400 Subject: [PATCH 14/36] migration/cpr: Document obscure usage of g_autofree when parse str HMP parsing of cpr_exec_command contains an obscure usage of g_autofree. Provide a document for it to be clear that it's intentional, rather than memory leaked. Cc: Dr. David Alan Gilbert Reported-by: Peter Maydell Reviewed-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20251023161657.2821652-1-peterx@redhat.com Signed-off-by: Peter Xu --- migration/migration-hmp-cmds.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 847d18faaa..79426bf5d7 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -734,6 +734,12 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) visit_type_bool(v, param, &p->direct_io, &err); break; case MIGRATION_PARAMETER_CPR_EXEC_COMMAND: { + /* + * NOTE: g_autofree will only auto g_free() the strv array when + * needed, it will not free the strings within the array. It's + * intentional: when strv is set, the ownership of the strings will + * always be passed to p->cpr_exec_command via QAPI_LIST_APPEND(). + */ g_autofree char **strv = NULL; g_autoptr(GError) gerr = NULL; strList **tail = &p->cpr_exec_command; From 75e2cb144191ecdbba87cfea3608cdc0664c8142 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Mon, 21 Jul 2025 14:52:20 +0800 Subject: [PATCH 15/36] hostmem/shm: Allow shm memory backend serve as shared memory for coco-VMs shm can surely serve as the shared memory for coco-VMs. But currently it doesn't check the backend->guest_memfd to pass down the RAM_GUEST_MEMFD flag. It leads to failure when creating coco-VMs (e.g., TDX guest) which require private mmeory. 
Set and pass down RAM_GUEST_MEMFD when backend->guest_memfd is true, to allow shm memory backend serve as shared memory for coco-VMs. Cc: Stefano Garzarella Cc: qemu-stable Signed-off-by: Xiaoyao Li Acked-by: David Hildenbrand Acked-by: Stefano Garzarella Link: https://lore.kernel.org/r/20250721065220.895606-1-xiaoyao.li@intel.com Signed-off-by: Peter Xu --- backends/hostmem-shm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/hostmem-shm.c b/backends/hostmem-shm.c index f66211a2ec..806e2670e0 100644 --- a/backends/hostmem-shm.c +++ b/backends/hostmem-shm.c @@ -54,6 +54,7 @@ have_fd: /* Let's do the same as memory-backend-ram,share=on would do. */ ram_flags = RAM_SHARED; ram_flags |= backend->reserve ? 0 : RAM_NORESERVE; + ram_flags |= backend->guest_memfd ? RAM_GUEST_MEMFD : 0; return memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), backend_name, backend->size, From 986c3292c6b7dbd12868d06555c82757aaab43b6 Mon Sep 17 00:00:00 2001 From: Arun Menon Date: Tue, 28 Oct 2025 11:51:02 +0530 Subject: [PATCH 16/36] migration: Fix regression of passing error_fatal into vmstate_load_state() error_fatal is passed to vmstate_load_state() and vmstate_save_state() functions. This was introduced in commit c632ffbd74. This would exit(1) on error, and therefore does not allow to propagate the error back to the caller. To maintain consistency with prior error handling i.e. either propagating the error to the caller or reporting it, we must set the error within a local Error object instead of using error_fatal. Reviewed-by: Cornelia Huck Signed-off-by: Arun Menon Reviewed-by: Akihiko Odaki Link: https://lore.kernel.org/r/20251028-solve_error_fatal_regression-v2-1-dab24c808a28@redhat.com [peterx: always uninit var ret, per Akihiko] [peterx: touchups on line ordering, spacings etc.] 
Signed-off-by: Peter Xu --- hw/display/virtio-gpu.c | 21 ++++++++++++++------- hw/pci/pci.c | 15 +++++++++++++-- hw/s390x/virtio-ccw.c | 17 +++++++++++++++-- hw/scsi/spapr_vscsi.c | 10 ++++++++-- hw/virtio/virtio-mmio.c | 15 +++++++++++++-- hw/virtio/virtio-pci.c | 15 +++++++++++++-- hw/virtio/virtio.c | 10 +++++++--- 7 files changed, 83 insertions(+), 20 deletions(-) diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index 3a555125be..43e88a4daf 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -1225,7 +1225,8 @@ static int virtio_gpu_save(QEMUFile *f, void *opaque, size_t size, { VirtIOGPU *g = opaque; struct virtio_gpu_simple_resource *res; - int i; + Error *err = NULL; + int i, ret; /* in 2d mode we should never find unprocessed commands here */ assert(QTAILQ_EMPTY(&g->cmdq)); @@ -1248,8 +1249,12 @@ static int virtio_gpu_save(QEMUFile *f, void *opaque, size_t size, } qemu_put_be32(f, 0); /* end of list */ - return vmstate_save_state(f, &vmstate_virtio_gpu_scanouts, g, NULL, - &error_fatal); + ret = vmstate_save_state(f, &vmstate_virtio_gpu_scanouts, g, NULL, + &err); + if (ret < 0) { + error_report_err(err); + } + return ret; } static bool virtio_gpu_load_restore_mapping(VirtIOGPU *g, @@ -1288,7 +1293,7 @@ static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size, Error *err = NULL; struct virtio_gpu_simple_resource *res; uint32_t resource_id, pformat; - int i; + int i, ret; g->hostmem = 0; @@ -1348,9 +1353,11 @@ static int virtio_gpu_load(QEMUFile *f, void *opaque, size_t size, } /* load & apply scanout state */ - vmstate_load_state(f, &vmstate_virtio_gpu_scanouts, g, 1, &error_fatal); - - return 0; + ret = vmstate_load_state(f, &vmstate_virtio_gpu_scanouts, g, 1, &err); + if (ret < 0) { + error_report_err(err); + } + return ret; } static int virtio_gpu_blob_save(QEMUFile *f, void *opaque, size_t size, diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 738b0c41ae..b1eba348e0 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ 
-921,21 +921,32 @@ const VMStateDescription vmstate_pci_device = { void pci_device_save(PCIDevice *s, QEMUFile *f) { + Error *local_err = NULL; + int ret; + /* Clear interrupt status bit: it is implicit * in irq_state which we are saving. * This makes us compatible with old devices * which never set or clear this bit. */ s->config[PCI_STATUS] &= ~PCI_STATUS_INTERRUPT; - vmstate_save_state(f, &vmstate_pci_device, s, NULL, &error_fatal); + ret = vmstate_save_state(f, &vmstate_pci_device, s, NULL, &local_err); + if (ret < 0) { + error_report_err(local_err); + } /* Restore the interrupt status bit. */ pci_update_irq_status(s); } int pci_device_load(PCIDevice *s, QEMUFile *f) { + Error *local_err = NULL; int ret; + ret = vmstate_load_state(f, &vmstate_pci_device, s, s->version_id, - &error_fatal); + &local_err); + if (ret < 0) { + error_report_err(local_err); + } /* Restore the interrupt status bit. */ pci_update_irq_status(s); return ret; diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index 4cb1ced001..4a3ffb84f8 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -1130,13 +1130,26 @@ static int virtio_ccw_load_queue(DeviceState *d, int n, QEMUFile *f) static void virtio_ccw_save_config(DeviceState *d, QEMUFile *f) { VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(d); - vmstate_save_state(f, &vmstate_virtio_ccw_dev, dev, NULL, &error_fatal); + Error *local_err = NULL; + int ret; + + ret = vmstate_save_state(f, &vmstate_virtio_ccw_dev, dev, NULL, &local_err); + if (ret < 0) { + error_report_err(local_err); + } } static int virtio_ccw_load_config(DeviceState *d, QEMUFile *f) { VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(d); - return vmstate_load_state(f, &vmstate_virtio_ccw_dev, dev, 1, &error_fatal); + Error *local_err = NULL; + int ret; + + ret = vmstate_load_state(f, &vmstate_virtio_ccw_dev, dev, 1, &local_err); + if (ret < 0) { + error_report_err(local_err); + } + return ret; } static void virtio_ccw_pre_plugged(DeviceState *d, Error **errp) diff 
--git a/hw/scsi/spapr_vscsi.c b/hw/scsi/spapr_vscsi.c index f0a7dd2b88..a6591319db 100644 --- a/hw/scsi/spapr_vscsi.c +++ b/hw/scsi/spapr_vscsi.c @@ -628,10 +628,16 @@ static const VMStateDescription vmstate_spapr_vscsi_req = { static void vscsi_save_request(QEMUFile *f, SCSIRequest *sreq) { vscsi_req *req = sreq->hba_private; + Error *local_err = NULL; + int rc; + assert(req->active); - vmstate_save_state(f, &vmstate_spapr_vscsi_req, req, NULL, &error_fatal); - + rc = vmstate_save_state(f, &vmstate_spapr_vscsi_req, req, NULL, &local_err); + if (rc < 0) { + error_report_err(local_err); + return; + } trace_spapr_vscsi_save_request(req->qtag, req->cur_desc_num, req->cur_desc_offset); } diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c index fb58c36452..c05c00bcd4 100644 --- a/hw/virtio/virtio-mmio.c +++ b/hw/virtio/virtio-mmio.c @@ -612,15 +612,26 @@ static const VMStateDescription vmstate_virtio_mmio = { static void virtio_mmio_save_extra_state(DeviceState *opaque, QEMUFile *f) { VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque); + Error *local_err = NULL; + int ret; - vmstate_save_state(f, &vmstate_virtio_mmio, proxy, NULL, &error_fatal); + ret = vmstate_save_state(f, &vmstate_virtio_mmio, proxy, NULL, &local_err); + if (ret < 0) { + error_report_err(local_err); + } } static int virtio_mmio_load_extra_state(DeviceState *opaque, QEMUFile *f) { VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque); + Error *local_err = NULL; + int ret; - return vmstate_load_state(f, &vmstate_virtio_mmio, proxy, 1, &error_fatal); + ret = vmstate_load_state(f, &vmstate_virtio_mmio, proxy, 1, &local_err); + if (ret < 0) { + error_report_err(local_err); + } + return ret; } static bool virtio_mmio_has_extra_state(DeviceState *opaque) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 937e22f08a..99cb30fe59 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -187,15 +187,26 @@ static bool virtio_pci_has_extra_state(DeviceState *d) static void 
virtio_pci_save_extra_state(DeviceState *d, QEMUFile *f) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + Error *local_err = NULL; + int ret; - vmstate_save_state(f, &vmstate_virtio_pci, proxy, NULL, &error_fatal); + ret = vmstate_save_state(f, &vmstate_virtio_pci, proxy, NULL, &local_err); + if (ret < 0) { + error_report_err(local_err); + } } static int virtio_pci_load_extra_state(DeviceState *d, QEMUFile *f) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + Error *local_err = NULL; + int ret; - return vmstate_load_state(f, &vmstate_virtio_pci, proxy, 1, &error_fatal); + ret = vmstate_load_state(f, &vmstate_virtio_pci, proxy, 1, &local_err); + if (ret < 0) { + error_report_err(local_err); + } + return ret; } static void virtio_pci_save_queue(DeviceState *d, int n, QEMUFile *f) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 153ee0a0cf..257cda506a 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -3030,7 +3030,7 @@ int virtio_save(VirtIODevice *vdev, QEMUFile *f) VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); uint32_t guest_features_lo = (vdev->guest_features & 0xffffffff); - int i; + int i, ret; Error *local_err = NULL; if (k->save_config) { @@ -3075,7 +3075,7 @@ int virtio_save(VirtIODevice *vdev, QEMUFile *f) } if (vdc->vmsd) { - int ret = vmstate_save_state(f, vdc->vmsd, vdev, NULL, &local_err); + ret = vmstate_save_state(f, vdc->vmsd, vdev, NULL, &local_err); if (ret) { error_report_err(local_err); return ret; @@ -3083,7 +3083,11 @@ int virtio_save(VirtIODevice *vdev, QEMUFile *f) } /* Subsections */ - return vmstate_save_state(f, &vmstate_virtio, vdev, NULL, &error_fatal); + ret = vmstate_save_state(f, &vmstate_virtio, vdev, NULL, &local_err); + if (ret < 0) { + error_report_err(local_err); + } + return ret; } /* A wrapper for use as a VMState .put function */ From 74343a438c796cd1e679db94294c666469a8c4a8 Mon Sep 17 00:00:00 2001 From: Bin Guo Date: Sat, 25 Oct 2025 04:55:32 
+0800 Subject: [PATCH 17/36] migration: Don't free the reason after calling migrate_add_blocker Function migrate_add_blocker will free the reason and set it to NULL if failure is returned. Signed-off-by: Bin Guo Reviewed-by: Markus Armbruster Link: https://lore.kernel.org/r/20251024205532.19883-1-guobin@linux.alibaba.com Signed-off-by: Peter Xu --- hw/intc/arm_gicv3_kvm.c | 1 - target/i386/sev.c | 1 - 2 files changed, 2 deletions(-) diff --git a/hw/intc/arm_gicv3_kvm.c b/hw/intc/arm_gicv3_kvm.c index 66b0dddfd4..6f311e37ef 100644 --- a/hw/intc/arm_gicv3_kvm.c +++ b/hw/intc/arm_gicv3_kvm.c @@ -841,7 +841,6 @@ static void kvm_arm_gicv3_realize(DeviceState *dev, Error **errp) error_setg(&kvm_nv_migration_blocker, "Live migration disabled because KVM nested virt is enabled"); if (migrate_add_blocker(&kvm_nv_migration_blocker, errp)) { - error_free(kvm_nv_migration_blocker); return; } diff --git a/target/i386/sev.c b/target/i386/sev.c index 1057b8ab2c..fd2dada013 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -1661,7 +1661,6 @@ sev_snp_launch_finish(SevCommonState *sev_common) ret = migrate_add_blocker(&sev_mig_blocker, &local_err); if (local_err) { error_report_err(local_err); - error_free(sev_mig_blocker); exit(1); } } From 75a9f080c2195054cd6f37c75615fc4908283051 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Mon, 27 Oct 2025 07:45:01 +0100 Subject: [PATCH 18/36] migration: Use unsigned instead of int for bit set of MigMode Signed operands in bitwise operations are unwise. I believe they're safe here, but avoiding them is easy, so let's do that. 
Signed-off-by: Markus Armbruster Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20251027064503.1074255-2-armbru@redhat.com Signed-off-by: Peter Xu --- migration/migration.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 4ed2a2e881..478c76bc25 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1672,9 +1672,9 @@ void migration_cancel(void) } } -static int get_modes(MigMode mode, va_list ap); +static unsigned get_modes(MigMode mode, va_list ap); -static void add_notifiers(NotifierWithReturn *notify, int modes) +static void add_notifiers(NotifierWithReturn *notify, unsigned modes) { for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { if (modes & BIT(mode)) { @@ -1687,7 +1687,7 @@ static void add_notifiers(NotifierWithReturn *notify, int modes) void migration_add_notifier_modes(NotifierWithReturn *notify, MigrationNotifyFunc func, MigMode mode, ...) { - int modes; + unsigned modes; va_list ap; va_start(ap, mode); @@ -1876,7 +1876,7 @@ static bool is_busy(Error **reasonp, Error **errp) return false; } -static bool is_only_migratable(Error **reasonp, Error **errp, int modes) +static bool is_only_migratable(Error **reasonp, Error **errp, unsigned modes) { ERRP_GUARD(); @@ -1890,9 +1890,9 @@ static bool is_only_migratable(Error **reasonp, Error **errp, int modes) return false; } -static int get_modes(MigMode mode, va_list ap) +static unsigned get_modes(MigMode mode, va_list ap) { - int modes = 0; + unsigned modes = 0; while (mode != -1 && mode != MIG_MODE_ALL) { assert(mode >= MIG_MODE_NORMAL && mode < MIG_MODE__MAX); @@ -1905,7 +1905,7 @@ static int get_modes(MigMode mode, va_list ap) return modes; } -static int add_blockers(Error **reasonp, Error **errp, int modes) +static int add_blockers(Error **reasonp, Error **errp, unsigned modes) { for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { if (modes & BIT(mode)) { @@ -1928,7 +1928,7 @@ int 
migrate_add_blocker_normal(Error **reasonp, Error **errp) int migrate_add_blocker_modes(Error **reasonp, Error **errp, MigMode mode, ...) { - int modes; + unsigned modes; va_list ap; va_start(ap, mode); @@ -1945,7 +1945,7 @@ int migrate_add_blocker_modes(Error **reasonp, Error **errp, MigMode mode, ...) int migrate_add_blocker_internal(Error **reasonp, Error **errp) { - int modes = BIT(MIG_MODE__MAX) - 1; + unsigned modes = BIT(MIG_MODE__MAX) - 1; if (is_busy(reasonp, errp)) { return -EBUSY; From 3ca0a0ab05ca4c96c0e6c9cd39c328ffb068b539 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Mon, 27 Oct 2025 07:45:02 +0100 Subject: [PATCH 19/36] migration: Use bitset of MigMode instead of variable arguments migrate_add_blocker_modes() and migration_add_notifier_modes use variable arguments for a set of migration modes. The variable arguments get collected into a bitset for processing. Take a bitset argument instead, it's simpler. Signed-off-by: Markus Armbruster Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20251027064503.1074255-3-armbru@redhat.com Signed-off-by: Peter Xu --- hw/vfio/container-legacy.c | 6 +++--- hw/vfio/cpr-iommufd.c | 6 +++--- hw/vfio/cpr-legacy.c | 8 +++---- hw/vfio/cpr.c | 5 ++--- hw/vfio/device.c | 4 ++-- include/migration/blocker.h | 9 +++----- include/migration/misc.h | 10 ++++----- migration/migration.c | 43 ++++++------------------------------- stubs/migr-blocker.c | 2 +- system/physmem.c | 8 +++---- 10 files changed, 33 insertions(+), 68 deletions(-) diff --git a/hw/vfio/container-legacy.c b/hw/vfio/container-legacy.c index 8e9639603e..32c260b345 100644 --- a/hw/vfio/container-legacy.c +++ b/hw/vfio/container-legacy.c @@ -977,9 +977,9 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, if (vbasedev->mdev) { error_setg(&vbasedev->cpr.mdev_blocker, "CPR does not support vfio mdev %s", vbasedev->name); - if (migrate_add_blocker_modes(&vbasedev->cpr.mdev_blocker, errp, - MIG_MODE_CPR_TRANSFER, 
MIG_MODE_CPR_EXEC, - -1) < 0) { + if (migrate_add_blocker_modes(&vbasedev->cpr.mdev_blocker, + BIT(MIG_MODE_CPR_TRANSFER) | BIT(MIG_MODE_CPR_EXEC), + errp) < 0) { goto hiod_unref_exit; } } diff --git a/hw/vfio/cpr-iommufd.c b/hw/vfio/cpr-iommufd.c index 8a4d65de5e..c244db88fb 100644 --- a/hw/vfio/cpr-iommufd.c +++ b/hw/vfio/cpr-iommufd.c @@ -158,9 +158,9 @@ bool vfio_iommufd_cpr_register_iommufd(IOMMUFDBackend *be, Error **errp) Error **cpr_blocker = &be->cpr_blocker; if (!vfio_cpr_supported(be, cpr_blocker)) { - return migrate_add_blocker_modes(cpr_blocker, errp, - MIG_MODE_CPR_TRANSFER, - MIG_MODE_CPR_EXEC, -1) == 0; + return migrate_add_blocker_modes(cpr_blocker, + BIT(MIG_MODE_CPR_TRANSFER) | BIT(MIG_MODE_CPR_TRANSFER), + errp); } vmstate_register(NULL, -1, &iommufd_cpr_vmstate, be); diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c index 7184c93991..86c943158e 100644 --- a/hw/vfio/cpr-legacy.c +++ b/hw/vfio/cpr-legacy.c @@ -176,9 +176,9 @@ bool vfio_legacy_cpr_register_container(VFIOLegacyContainer *container, MIG_MODE_CPR_REBOOT); if (!vfio_cpr_supported(container, cpr_blocker)) { - return migrate_add_blocker_modes(cpr_blocker, errp, - MIG_MODE_CPR_TRANSFER, - MIG_MODE_CPR_EXEC, -1) == 0; + return migrate_add_blocker_modes(cpr_blocker, + BIT(MIG_MODE_CPR_TRANSFER) | BIT(MIG_MODE_CPR_EXEC), + errp) == 0; } vfio_cpr_add_kvm_notifier(); @@ -187,7 +187,7 @@ bool vfio_legacy_cpr_register_container(VFIOLegacyContainer *container, migration_add_notifier_modes(&container->cpr.transfer_notifier, vfio_cpr_fail_notifier, - MIG_MODE_CPR_TRANSFER, MIG_MODE_CPR_EXEC, -1); + BIT(MIG_MODE_CPR_TRANSFER) | BIT(MIG_MODE_CPR_EXEC)); return true; } diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c index db462aabcb..998230d271 100644 --- a/hw/vfio/cpr.c +++ b/hw/vfio/cpr.c @@ -197,8 +197,7 @@ void vfio_cpr_add_kvm_notifier(void) if (!kvm_close_notifier.notify) { migration_add_notifier_modes(&kvm_close_notifier, vfio_cpr_kvm_close_notifier, - MIG_MODE_CPR_TRANSFER, 
MIG_MODE_CPR_EXEC, - -1); + BIT(MIG_MODE_CPR_TRANSFER) | BIT(MIG_MODE_CPR_EXEC)); } } @@ -285,7 +284,7 @@ void vfio_cpr_pci_register_device(VFIOPCIDevice *vdev) { migration_add_notifier_modes(&vdev->cpr.transfer_notifier, vfio_cpr_pci_notifier, - MIG_MODE_CPR_TRANSFER, MIG_MODE_CPR_EXEC, -1); + BIT(MIG_MODE_CPR_TRANSFER) | BIT(MIG_MODE_CPR_EXEC)); } void vfio_cpr_pci_unregister_device(VFIOPCIDevice *vdev) diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 8b63e765ac..76869828fc 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -345,8 +345,8 @@ bool vfio_device_get_name(VFIODevice *vbasedev, Error **errp) "vfio device with fd=%d needs an id property", vbasedev->fd); return migrate_add_blocker_modes(&vbasedev->cpr.id_blocker, - errp, MIG_MODE_CPR_TRANSFER, - -1) == 0; + BIT(MIG_MODE_CPR_TRANSFER), + errp) == 0; } } } diff --git a/include/migration/blocker.h b/include/migration/blocker.h index a687ac0efe..80b75ad5cb 100644 --- a/include/migration/blocker.h +++ b/include/migration/blocker.h @@ -16,8 +16,6 @@ #include "qapi/qapi-types-migration.h" -#define MIG_MODE_ALL MIG_MODE__MAX - /** * @migrate_add_blocker - prevent all modes of migration from proceeding * @@ -82,16 +80,15 @@ int migrate_add_blocker_normal(Error **reasonp, Error **errp); * * @reasonp - address of an error to be returned whenever migration is attempted * - * @errp - [out] The reason (if any) we cannot block migration right now. + * @modes - the migration modes to be blocked, a bit set of MigMode * - * @mode - one or more migration modes to be blocked. The list is terminated - * by -1 or MIG_MODE_ALL. For the latter, all modes are blocked. + * @errp - [out] The reason (if any) we cannot block migration right now. * * @returns - 0 on success, -EBUSY/-EACCES on failure, with errp set. * * *@reasonp is freed and set to NULL if failure is returned. * On success, the caller must not free *@reasonp before the blocker is removed. 
*/ -int migrate_add_blocker_modes(Error **reasonp, Error **errp, MigMode mode, ...); +int migrate_add_blocker_modes(Error **reasonp, unsigned modes, Error **errp); #endif diff --git a/include/migration/misc.h b/include/migration/misc.h index 592b93021e..e26d418a6e 100644 --- a/include/migration/misc.h +++ b/include/migration/misc.h @@ -90,18 +90,18 @@ void migration_add_notifier(NotifierWithReturn *notify, MigrationNotifyFunc func); /* - * Same as migration_add_notifier, but applies to be specified @mode. + * Same as migration_add_notifier, but applies to the specified @mode + * instead of MIG_MODE_NORMAL. */ void migration_add_notifier_mode(NotifierWithReturn *notify, MigrationNotifyFunc func, MigMode mode); /* - * Same as migration_add_notifier, but applies to all @mode in the argument - * list. The list is terminated by -1 or MIG_MODE_ALL. For the latter, - * the notifier is added for all modes. + * Same as migration_add_notifier, but applies to the specified @modes + * (a bitset of MigMode). */ void migration_add_notifier_modes(NotifierWithReturn *notify, - MigrationNotifyFunc func, MigMode mode, ...); + MigrationNotifyFunc func, unsigned modes); /* * Remove a notifier from all modes. diff --git a/migration/migration.c b/migration/migration.c index 478c76bc25..f613b95287 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1672,8 +1672,6 @@ void migration_cancel(void) } } -static unsigned get_modes(MigMode mode, va_list ap); - static void add_notifiers(NotifierWithReturn *notify, unsigned modes) { for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { @@ -1685,15 +1683,8 @@ static void add_notifiers(NotifierWithReturn *notify, unsigned modes) } void migration_add_notifier_modes(NotifierWithReturn *notify, - MigrationNotifyFunc func, MigMode mode, ...) 
+ MigrationNotifyFunc func, unsigned modes) { - unsigned modes; - va_list ap; - - va_start(ap, mode); - modes = get_modes(mode, ap); - va_end(ap); - notify->notify = (NotifierWithReturnFunc)func; add_notifiers(notify, modes); } @@ -1701,13 +1692,13 @@ void migration_add_notifier_modes(NotifierWithReturn *notify, void migration_add_notifier_mode(NotifierWithReturn *notify, MigrationNotifyFunc func, MigMode mode) { - migration_add_notifier_modes(notify, func, mode, -1); + migration_add_notifier_modes(notify, func, BIT(mode)); } void migration_add_notifier(NotifierWithReturn *notify, MigrationNotifyFunc func) { - migration_add_notifier_modes(notify, func, MIG_MODE_NORMAL, -1); + migration_add_notifier_mode(notify, func, MIG_MODE_NORMAL); } void migration_remove_notifier(NotifierWithReturn *notify) @@ -1890,21 +1881,6 @@ static bool is_only_migratable(Error **reasonp, Error **errp, unsigned modes) return false; } -static unsigned get_modes(MigMode mode, va_list ap) -{ - unsigned modes = 0; - - while (mode != -1 && mode != MIG_MODE_ALL) { - assert(mode >= MIG_MODE_NORMAL && mode < MIG_MODE__MAX); - modes |= BIT(mode); - mode = va_arg(ap, MigMode); - } - if (mode == MIG_MODE_ALL) { - modes = BIT(MIG_MODE__MAX) - 1; - } - return modes; -} - static int add_blockers(Error **reasonp, Error **errp, unsigned modes) { for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { @@ -1918,23 +1894,16 @@ static int add_blockers(Error **reasonp, Error **errp, unsigned modes) int migrate_add_blocker(Error **reasonp, Error **errp) { - return migrate_add_blocker_modes(reasonp, errp, MIG_MODE_ALL); + return migrate_add_blocker_modes(reasonp, -1u, errp); } int migrate_add_blocker_normal(Error **reasonp, Error **errp) { - return migrate_add_blocker_modes(reasonp, errp, MIG_MODE_NORMAL, -1); + return migrate_add_blocker_modes(reasonp, BIT(MIG_MODE_NORMAL), errp); } -int migrate_add_blocker_modes(Error **reasonp, Error **errp, MigMode mode, ...) 
+int migrate_add_blocker_modes(Error **reasonp, unsigned modes, Error **errp) { - unsigned modes; - va_list ap; - - va_start(ap, mode); - modes = get_modes(mode, ap); - va_end(ap); - if (is_only_migratable(reasonp, errp, modes)) { return -EACCES; } else if (is_busy(reasonp, errp)) { diff --git a/stubs/migr-blocker.c b/stubs/migr-blocker.c index 11cbff268f..e54c7160d3 100644 --- a/stubs/migr-blocker.c +++ b/stubs/migr-blocker.c @@ -11,7 +11,7 @@ int migrate_add_blocker_normal(Error **reasonp, Error **errp) return 0; } -int migrate_add_blocker_modes(Error **reasonp, Error **errp, MigMode mode, ...) +int migrate_add_blocker_modes(Error **reasonp, unsigned modes, Error **errp) { return 0; } diff --git a/system/physmem.c b/system/physmem.c index a340ca3e61..a7e2a5d07f 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -2255,8 +2255,8 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) "Memory region %s uses guest_memfd, " "which is not supported with CPR.", memory_region_name(new_block->mr)); - migrate_add_blocker_modes(&new_block->cpr_blocker, errp, - MIG_MODE_CPR_TRANSFER, -1); + migrate_add_blocker_modes(&new_block->cpr_blocker, + BIT(MIG_MODE_CPR_TRANSFER), errp); } } @@ -4462,8 +4462,8 @@ void ram_block_add_cpr_blocker(RAMBlock *rb, Error **errp) "Memory region %s is not compatible with CPR. share=on is " "required for memory-backend objects, and aux-ram-share=on is " "required.", memory_region_name(rb->mr)); - migrate_add_blocker_modes(&rb->cpr_blocker, errp, MIG_MODE_CPR_TRANSFER, - -1); + migrate_add_blocker_modes(&rb->cpr_blocker, BIT(MIG_MODE_CPR_TRANSFER), + errp); } void ram_block_del_cpr_blocker(RAMBlock *rb) From 5777e20718c6e7088b3e1488857b7d7905c36980 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Mon, 27 Oct 2025 07:45:03 +0100 Subject: [PATCH 20/36] migration: Put Error **errp parameter last qapi/error.h's big comment: * - Functions that use Error to report errors have an Error **errp * parameter. 
It should be the last parameter, except for functions * taking variable arguments. is_only_migratable() and add_blockers() have it in the middle. Clean them up. Signed-off-by: Markus Armbruster Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20251027064503.1074255-4-armbru@redhat.com Signed-off-by: Peter Xu --- migration/migration.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index f613b95287..5e74993b46 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1867,7 +1867,7 @@ static bool is_busy(Error **reasonp, Error **errp) return false; } -static bool is_only_migratable(Error **reasonp, Error **errp, unsigned modes) +static bool is_only_migratable(Error **reasonp, unsigned modes, Error **errp) { ERRP_GUARD(); @@ -1881,7 +1881,7 @@ static bool is_only_migratable(Error **reasonp, Error **errp, unsigned modes) return false; } -static int add_blockers(Error **reasonp, Error **errp, unsigned modes) +static int add_blockers(Error **reasonp, unsigned modes, Error **errp) { for (MigMode mode = 0; mode < MIG_MODE__MAX; mode++) { if (modes & BIT(mode)) { @@ -1904,12 +1904,12 @@ int migrate_add_blocker_normal(Error **reasonp, Error **errp) int migrate_add_blocker_modes(Error **reasonp, unsigned modes, Error **errp) { - if (is_only_migratable(reasonp, errp, modes)) { + if (is_only_migratable(reasonp, modes, errp)) { return -EACCES; } else if (is_busy(reasonp, errp)) { return -EBUSY; } - return add_blockers(reasonp, errp, modes); + return add_blockers(reasonp, modes, errp); } int migrate_add_blocker_internal(Error **reasonp, Error **errp) @@ -1919,7 +1919,7 @@ int migrate_add_blocker_internal(Error **reasonp, Error **errp) if (is_busy(reasonp, errp)) { return -EBUSY; } - return add_blockers(reasonp, errp, modes); + return add_blockers(reasonp, modes, errp); } void migrate_del_blocker(Error **reasonp) From 1edf0df28409cdf637e5f5ebba515fc63ed4926f Mon Sep 17 00:00:00 2001 
From: Peter Xu Date: Wed, 22 Oct 2025 15:26:00 -0400 Subject: [PATCH 21/36] io: Add qio_channel_wait_cond() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the helper to wait for QIO channel's IO availability in any context (coroutine, or non-coroutine). Use it tree-wide for three occurrences. Cc: Daniel P. Berrangé Reviewed-by: Daniel P. Berrangé Reviewed-by: Vladimir Sementsov-Ogievskiy Link: https://lore.kernel.org/r/20251022192612.2737648-2-peterx@redhat.com Signed-off-by: Peter Xu --- include/io/channel.h | 15 +++++++++++++++ io/channel.c | 21 +++++++++++---------- migration/qemu-file.c | 6 +----- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/include/io/channel.h b/include/io/channel.h index 0f25ae0069..d6d5bf2b5f 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -871,6 +871,21 @@ void qio_channel_wake_read(QIOChannel *ioc); void qio_channel_wait(QIOChannel *ioc, GIOCondition condition); +/** + * qio_channel_wait_cond: + * @ioc: the channel object + * @condition: the I/O condition to wait for + * + * Block execution from the current thread until + * the condition indicated by @condition becomes + * available. + * + * This will work with/without a coroutine context, by automatically select + * the proper API to wait. 
+ */ +void qio_channel_wait_cond(QIOChannel *ioc, + GIOCondition condition); + /** * qio_channel_set_aio_fd_handler: * @ioc: the channel object diff --git a/io/channel.c b/io/channel.c index 852e684938..b18fc346ff 100644 --- a/io/channel.c +++ b/io/channel.c @@ -159,11 +159,7 @@ int coroutine_mixed_fn qio_channel_readv_full_all_eof(QIOChannel *ioc, len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds, local_nfds, flags, errp); if (len == QIO_CHANNEL_ERR_BLOCK) { - if (qemu_in_coroutine()) { - qio_channel_yield(ioc, G_IO_IN); - } else { - qio_channel_wait(ioc, G_IO_IN); - } + qio_channel_wait_cond(ioc, G_IO_IN); continue; } @@ -268,11 +264,7 @@ int coroutine_mixed_fn qio_channel_writev_full_all(QIOChannel *ioc, nfds, flags, errp); if (len == QIO_CHANNEL_ERR_BLOCK) { - if (qemu_in_coroutine()) { - qio_channel_yield(ioc, G_IO_OUT); - } else { - qio_channel_wait(ioc, G_IO_OUT); - } + qio_channel_wait_cond(ioc, G_IO_OUT); continue; } if (len < 0) { @@ -774,6 +766,15 @@ void qio_channel_wait(QIOChannel *ioc, g_main_context_unref(ctxt); } +void qio_channel_wait_cond(QIOChannel *ioc, + GIOCondition condition) +{ + if (qemu_in_coroutine()) { + qio_channel_yield(ioc, condition); + } else { + qio_channel_wait(ioc, condition); + } +} static void qio_channel_finalize(Object *obj) { diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 2d4ce174a5..4b5a409a80 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -343,11 +343,7 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f) QIO_CHANNEL_READ_FLAG_FD_PRESERVE_BLOCKING, &local_error); if (len == QIO_CHANNEL_ERR_BLOCK) { - if (qemu_in_coroutine()) { - qio_channel_yield(f->ioc, G_IO_IN); - } else { - qio_channel_wait(f->ioc, G_IO_IN); - } + qio_channel_wait_cond(f->ioc, G_IO_IN); } } while (len == QIO_CHANNEL_ERR_BLOCK); From 604bb1badcade5f49264cf8901c2f3a528df305a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 22 Oct 2025 15:26:01 -0400 Subject: [PATCH 22/36] migration: 
Properly wait on G_IO_IN when peeking messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migration_channel_read_peek() used to do explicit waits of a short period when peeking message needs retry. Replace it with explicit polls on the io channel, exactly like what qemu_fill_buffer() does. Reviewed-by: Daniel P. Berrangé Reviewed-by: Vladimir Sementsov-Ogievskiy Link: https://lore.kernel.org/r/20251022192612.2737648-3-peterx@redhat.com Signed-off-by: Peter Xu --- migration/channel.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/migration/channel.c b/migration/channel.c index a547b1fbfe..462cc183e1 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -135,12 +135,7 @@ int migration_channel_read_peek(QIOChannel *ioc, break; } - /* 1ms sleep. */ - if (qemu_in_coroutine()) { - qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000); - } else { - g_usleep(1000); - } + qio_channel_wait_cond(ioc, G_IO_IN); } return 0; From d4b3a3cc55845685c16d4313ff345c392262d940 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 28 Oct 2025 16:07:37 +0300 Subject: [PATCH 23/36] migration: vmstate_save_state_v(): fix error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case of pre_save_errp, on error, we continue processing fields, unlike case of pre_save, where we return immediately. Behavior for pre_save_errp case is wrong, we must return here, like for pre_save. 
Fixes: 40de712a89 ("migration: Add error-parameterized function variants in VMSD struct") Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Stefan Berger Link: https://lore.kernel.org/r/20251028130738.29037-2-vsementsov@yandex-team.ru Signed-off-by: Peter Xu --- migration/vmstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/migration/vmstate.c b/migration/vmstate.c index 81eadde553..fd066f910e 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -443,6 +443,7 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, if (ret < 0) { error_prepend(errp, "pre-save for %s failed, ret: %d: ", vmsd->name, ret); + return ret; } } else if (vmsd->pre_save) { ret = vmsd->pre_save(opaque); From 3469a56fa3dc98549d87535a80bb4bddfed44936 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 28 Oct 2025 16:07:38 +0300 Subject: [PATCH 24/36] tpm_emulator: improve and fix use of errp tpm_emulator_post_load() and tpm_emulator_set_state_blobs() have error paths, where they return a negative value, but do not set errp. To fix that, we also have to convert several other functions to set errp instead of error_reporting. 
Signed-off-by: Vladimir Sementsov-Ogievskiy Link: https://lore.kernel.org/r/20251028130738.29037-3-vsementsov@yandex-team.ru Signed-off-by: Peter Xu --- backends/tpm/tpm_emulator.c | 63 +++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/backends/tpm/tpm_emulator.c b/backends/tpm/tpm_emulator.c index f32b67dac7..da9c1056ed 100644 --- a/backends/tpm/tpm_emulator.c +++ b/backends/tpm/tpm_emulator.c @@ -308,22 +308,22 @@ static int tpm_emulator_check_caps(TPMEmulator *tpm_emu) return 0; } -static int tpm_emulator_stop_tpm(TPMBackend *tb) +static int tpm_emulator_stop_tpm(TPMBackend *tb, Error **errp) { TPMEmulator *tpm_emu = TPM_EMULATOR(tb); ptm_res res; if (tpm_emulator_ctrlcmd(tpm_emu, CMD_STOP, &res, 0, sizeof(ptm_res), sizeof(res)) < 0) { - error_report("tpm-emulator: Could not stop TPM: %s", - strerror(errno)); + error_setg(errp, "tpm-emulator: Could not stop TPM: %s", + strerror(errno)); return -1; } res = be32_to_cpu(res); if (res) { - error_report("tpm-emulator: TPM result for CMD_STOP: 0x%x %s", res, - tpm_emulator_strerror(res)); + error_setg(errp, "tpm-emulator: TPM result for CMD_STOP: 0x%x %s", res, + tpm_emulator_strerror(res)); return -1; } @@ -362,12 +362,13 @@ static int tpm_emulator_lock_storage(TPMEmulator *tpm_emu) static int tpm_emulator_set_buffer_size(TPMBackend *tb, size_t wanted_size, - size_t *actual_size) + size_t *actual_size, + Error **errp) { TPMEmulator *tpm_emu = TPM_EMULATOR(tb); ptm_setbuffersize psbs; - if (tpm_emulator_stop_tpm(tb) < 0) { + if (tpm_emulator_stop_tpm(tb, errp) < 0) { return -1; } @@ -376,16 +377,17 @@ static int tpm_emulator_set_buffer_size(TPMBackend *tb, if (tpm_emulator_ctrlcmd(tpm_emu, CMD_SET_BUFFERSIZE, &psbs, sizeof(psbs.u.req), sizeof(psbs.u.resp.tpm_result), sizeof(psbs.u.resp)) < 0) { - error_report("tpm-emulator: Could not set buffer size: %s", - strerror(errno)); + error_setg(errp, "tpm-emulator: Could not set buffer size: %s", + strerror(errno)); return 
-1; } psbs.u.resp.tpm_result = be32_to_cpu(psbs.u.resp.tpm_result); if (psbs.u.resp.tpm_result != 0) { - error_report("tpm-emulator: TPM result for set buffer size : 0x%x %s", - psbs.u.resp.tpm_result, - tpm_emulator_strerror(psbs.u.resp.tpm_result)); + error_setg(errp, + "tpm-emulator: TPM result for set buffer size : 0x%x %s", + psbs.u.resp.tpm_result, + tpm_emulator_strerror(psbs.u.resp.tpm_result)); return -1; } @@ -402,7 +404,7 @@ static int tpm_emulator_set_buffer_size(TPMBackend *tb, } static int tpm_emulator_startup_tpm_resume(TPMBackend *tb, size_t buffersize, - bool is_resume) + bool is_resume, Error **errp) { TPMEmulator *tpm_emu = TPM_EMULATOR(tb); ptm_init init = { @@ -413,7 +415,7 @@ static int tpm_emulator_startup_tpm_resume(TPMBackend *tb, size_t buffersize, trace_tpm_emulator_startup_tpm_resume(is_resume, buffersize); if (buffersize != 0 && - tpm_emulator_set_buffer_size(tb, buffersize, NULL) < 0) { + tpm_emulator_set_buffer_size(tb, buffersize, NULL, errp) < 0) { goto err_exit; } @@ -424,15 +426,15 @@ static int tpm_emulator_startup_tpm_resume(TPMBackend *tb, size_t buffersize, if (tpm_emulator_ctrlcmd(tpm_emu, CMD_INIT, &init, sizeof(init), sizeof(init.u.resp.tpm_result), sizeof(init)) < 0) { - error_report("tpm-emulator: could not send INIT: %s", - strerror(errno)); + error_setg(errp, "tpm-emulator: could not send INIT: %s", + strerror(errno)); goto err_exit; } res = be32_to_cpu(init.u.resp.tpm_result); if (res) { - error_report("tpm-emulator: TPM result for CMD_INIT: 0x%x %s", res, - tpm_emulator_strerror(res)); + error_setg(errp, "tpm-emulator: TPM result for CMD_INIT: 0x%x %s", res, + tpm_emulator_strerror(res)); goto err_exit; } return 0; @@ -441,18 +443,31 @@ err_exit: return -1; } -static int tpm_emulator_startup_tpm(TPMBackend *tb, size_t buffersize) +static int do_tpm_emulator_startup_tpm(TPMBackend *tb, size_t buffersize, + Error **errp) { /* TPM startup will be done from post_load hook */ if (runstate_check(RUN_STATE_INMIGRATE)) { if 
(buffersize != 0) { - return tpm_emulator_set_buffer_size(tb, buffersize, NULL); + return tpm_emulator_set_buffer_size(tb, buffersize, NULL, errp); } return 0; } - return tpm_emulator_startup_tpm_resume(tb, buffersize, false); + return tpm_emulator_startup_tpm_resume(tb, buffersize, false, errp); +} + +static int tpm_emulator_startup_tpm(TPMBackend *tb, size_t buffersize) +{ + Error *local_err = NULL; + int ret = do_tpm_emulator_startup_tpm(tb, buffersize, &local_err); + + if (ret < 0) { + error_report_err(local_err); + } + + return ret; } static bool tpm_emulator_get_tpm_established_flag(TPMBackend *tb) @@ -546,7 +561,7 @@ static size_t tpm_emulator_get_buffer_size(TPMBackend *tb) { size_t actual_size; - if (tpm_emulator_set_buffer_size(tb, 0, &actual_size) < 0) { + if (tpm_emulator_set_buffer_size(tb, 0, &actual_size, NULL) < 0) { return 4096; } @@ -889,7 +904,7 @@ static int tpm_emulator_set_state_blobs(TPMBackend *tb, Error **errp) trace_tpm_emulator_set_state_blobs(); - if (tpm_emulator_stop_tpm(tb) < 0) { + if (tpm_emulator_stop_tpm(tb, errp) < 0) { trace_tpm_emulator_set_state_blobs_error("Could not stop TPM"); return -EIO; } @@ -960,7 +975,7 @@ static int tpm_emulator_post_load(void *opaque, int version_id, Error **errp) return ret; } - if (tpm_emulator_startup_tpm_resume(tb, 0, true) < 0) { + if (tpm_emulator_startup_tpm_resume(tb, 0, true, errp) < 0) { return -EIO; } From 507685984cf04d216661cfd927773fa53123a0a9 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 28 Oct 2025 20:09:25 +0300 Subject: [PATCH 25/36] migration/vmstate: stop reporting error number for new _errp APIs The handlers .pre_load_errp, .post_load_errp and .pre_save_errp should put all needed information into errp, we should not append error number here. Note, that there are some more error messages with numeric error codes in this file. We leave them for another day, our current goal is to prepare for the following commit, which will update interface of _errp() APIs. 
Signed-off-by: Vladimir Sementsov-Ogievskiy Link: https://lore.kernel.org/r/20251028170926.77219-1-vsementsov@yandex-team.ru Signed-off-by: Peter Xu --- migration/vmstate.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/migration/vmstate.c b/migration/vmstate.c index fd066f910e..677e56c84a 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -157,9 +157,9 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, ret = vmsd->pre_load_errp(opaque, errp); if (ret < 0) { error_prepend(errp, "pre load hook failed for: '%s', " - "version_id: %d, minimum version_id: %d, " - "ret: %d: ", vmsd->name, vmsd->version_id, - vmsd->minimum_version_id, ret); + "version_id: %d, minimum version_id: %d: ", + vmsd->name, vmsd->version_id, + vmsd->minimum_version_id); return ret; } } else if (vmsd->pre_load) { @@ -259,8 +259,8 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, ret = vmsd->post_load_errp(opaque, version_id, errp); if (ret < 0) { error_prepend(errp, "post load hook failed for: %s, version_id: " - "%d, minimum_version: %d, ret: %d: ", vmsd->name, - vmsd->version_id, vmsd->minimum_version_id, ret); + "%d, minimum_version: %d: ", vmsd->name, + vmsd->version_id, vmsd->minimum_version_id); } } else if (vmsd->post_load) { ret = vmsd->post_load(opaque, version_id); @@ -441,8 +441,7 @@ int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, ret = vmsd->pre_save_errp(opaque, errp); trace_vmstate_save_state_pre_save_res(vmsd->name, ret); if (ret < 0) { - error_prepend(errp, "pre-save for %s failed, ret: %d: ", - vmsd->name, ret); + error_prepend(errp, "pre-save for %s failed: ", vmsd->name); return ret; } } else if (vmsd->pre_save) { From 8c3843638cbab2da66d3e9e662c03271d3a9490e Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 28 Oct 2025 20:09:26 +0300 Subject: [PATCH 26/36] migration: vmsd errp handlers: return bool No code actually depend on specific errno values 
returned by vmstate_load_state. The only use of it is to check for success, and sometimes inject numeric error values into error messages in migration code. The latter is not a stopper for gradual conversion to "errp + bool return value" APIs. Big analysis of vmstate_load_state() callers, showing that specific errno values are not actually used, is done by Peter here: https://lore.kernel.org/qemu-devel/aQDdRn8t0B8oE3gf@x1.local/ Converting of vmstate_load_state() itself will follow in another series. Signed-off-by: Vladimir Sementsov-Ogievskiy Link: https://lore.kernel.org/r/20251028170926.77219-2-vsementsov@yandex-team.ru Signed-off-by: Peter Xu --- backends/tpm/tpm_emulator.c | 10 ++++------ docs/devel/migration/main.rst | 6 +++--- include/migration/vmstate.h | 6 +++--- migration/vmstate.c | 11 +++++------ 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/backends/tpm/tpm_emulator.c b/backends/tpm/tpm_emulator.c index da9c1056ed..f10b9074fb 100644 --- a/backends/tpm/tpm_emulator.c +++ b/backends/tpm/tpm_emulator.c @@ -962,24 +962,22 @@ static void tpm_emulator_vm_state_change(void *opaque, bool running, /* * Load the TPM state blobs into the TPM. - * - * Returns negative errno codes in case of error. 
*/ -static int tpm_emulator_post_load(void *opaque, int version_id, Error **errp) +static bool tpm_emulator_post_load(void *opaque, int version_id, Error **errp) { TPMBackend *tb = opaque; int ret; ret = tpm_emulator_set_state_blobs(tb, errp); if (ret < 0) { - return ret; + return false; } if (tpm_emulator_startup_tpm_resume(tb, 0, true, errp) < 0) { - return -EIO; + return false; } - return 0; + return true; } static const VMStateDescription vmstate_tpm_emulator = { diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 1afe7b9689..234d280249 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -446,15 +446,15 @@ The functions to do that are inside a vmstate definition, and are called: Following are the errp variants of these functions. -- ``int (*pre_load_errp)(void *opaque, Error **errp);`` +- ``bool (*pre_load_errp)(void *opaque, Error **errp);`` This function is called before we load the state of one device. -- ``int (*post_load_errp)(void *opaque, int version_id, Error **errp);`` +- ``bool (*post_load_errp)(void *opaque, int version_id, Error **errp);`` This function is called after we load the state of one device. -- ``int (*pre_save_errp)(void *opaque, Error **errp);`` +- ``bool (*pre_save_errp)(void *opaque, Error **errp);`` This function is called before we save the state of one device. 
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 09f1eefcfb..df57e6550a 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -218,11 +218,11 @@ struct VMStateDescription { int minimum_version_id; MigrationPriority priority; int (*pre_load)(void *opaque); - int (*pre_load_errp)(void *opaque, Error **errp); + bool (*pre_load_errp)(void *opaque, Error **errp); int (*post_load)(void *opaque, int version_id); - int (*post_load_errp)(void *opaque, int version_id, Error **errp); + bool (*post_load_errp)(void *opaque, int version_id, Error **errp); int (*pre_save)(void *opaque); - int (*pre_save_errp)(void *opaque, Error **errp); + bool (*pre_save_errp)(void *opaque, Error **errp); int (*post_save)(void *opaque); bool (*needed)(void *opaque); bool (*dev_unplug_pending)(void *opaque); diff --git a/migration/vmstate.c b/migration/vmstate.c index 677e56c84a..4d28364f7b 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -154,13 +154,12 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, return -EINVAL; } if (vmsd->pre_load_errp) { - ret = vmsd->pre_load_errp(opaque, errp); - if (ret < 0) { + if (!vmsd->pre_load_errp(opaque, errp)) { error_prepend(errp, "pre load hook failed for: '%s', " "version_id: %d, minimum version_id: %d: ", vmsd->name, vmsd->version_id, vmsd->minimum_version_id); - return ret; + return -EINVAL; } } else if (vmsd->pre_load) { ret = vmsd->pre_load(opaque); @@ -256,11 +255,11 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, return ret; } if (vmsd->post_load_errp) { - ret = vmsd->post_load_errp(opaque, version_id, errp); - if (ret < 0) { + if (!vmsd->post_load_errp(opaque, version_id, errp)) { error_prepend(errp, "post load hook failed for: %s, version_id: " "%d, minimum_version: %d: ", vmsd->name, vmsd->version_id, vmsd->minimum_version_id); + ret = -EINVAL; } } else if (vmsd->post_load) { ret = vmsd->post_load(opaque, version_id); @@ -438,7 +437,7 @@ 
int vmstate_save_state_v(QEMUFile *f, const VMStateDescription *vmsd, trace_vmstate_save_state_top(vmsd->name); if (vmsd->pre_save_errp) { - ret = vmsd->pre_save_errp(opaque, errp); + ret = vmsd->pre_save_errp(opaque, errp) ? 0 : -EINVAL; trace_vmstate_save_state_pre_save_res(vmsd->name, ret); if (ret < 0) { error_prepend(errp, "pre-save for %s failed: ", vmsd->name); From ac7a5f1b72734491318e113a3c64cce103fe93b1 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 30 Oct 2025 10:26:38 +0100 Subject: [PATCH 27/36] scripts/vmstate-static-checker: Fix deprecation warnings with latest argparse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The argparse.FileType() type has been deprecated in the latest argparse version (e.g. the one from Fedora 43), now causing the test_bad_vmstate functional test to fail since there are unexpected strings in the output. Change the script to use pathlib.Path instead to fix the test_bad_vmstate test and to be prepared for the future when the deprecated FileType gets removed completely. Reported-by: Daniel P. Berrangé Signed-off-by: Thomas Huth Reviewed-by: Daniel P. Berrangé Link: https://lore.kernel.org/r/20251030092638.39505-1-thuth@redhat.com Signed-off-by: Peter Xu --- scripts/vmstate-static-checker.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/vmstate-static-checker.py b/scripts/vmstate-static-checker.py index 2335e25f94..89b100e6cc 100755 --- a/scripts/vmstate-static-checker.py +++ b/scripts/vmstate-static-checker.py @@ -21,6 +21,7 @@ import argparse import json +import pathlib import sys # Count the number of errors found @@ -382,10 +383,10 @@ def main(): help_text = "Parse JSON-formatted vmstate dumps from QEMU in files SRC and DEST. Checks whether migration from SRC to DEST QEMU versions would break based on the VMSTATE information contained within the JSON outputs. 
The JSON output is created from a QEMU invocation with the -dump-vmstate parameter and a filename argument to it. Other parameters to QEMU do not matter, except the -M (machine type) parameter." parser = argparse.ArgumentParser(description=help_text) - parser.add_argument('-s', '--src', type=argparse.FileType('r'), + parser.add_argument('-s', '--src', type=pathlib.Path, required=True, help='json dump from src qemu') - parser.add_argument('-d', '--dest', type=argparse.FileType('r'), + parser.add_argument('-d', '--dest', type=pathlib.Path, required=True, help='json dump from dest qemu') parser.add_argument('--reverse', required=False, default=False, @@ -393,10 +394,10 @@ def main(): help='reverse the direction') args = parser.parse_args() - src_data = json.load(args.src) - dest_data = json.load(args.dest) - args.src.close() - args.dest.close() + with open(args.src, 'r', encoding='utf-8') as src_fh: + src_data = json.load(src_fh) + with open(args.dest, 'r', encoding='utf-8') as dst_fh: + dest_data = json.load(dst_fh) if args.reverse: temp = src_data From c0c6a6ac7bd61f862d9c1800129c80dc2a93704e Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 29 Oct 2025 14:52:24 -0400 Subject: [PATCH 28/36] system/physmem: mark io_mem_unassigned lockless MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the Bus Master bit is disabled in a PCI device's Command Register, the device's DMA address space becomes unassigned memory (i.e. the io_mem_unassigned MemoryRegion). This can lead to deadlocks with IOThreads since io_mem_unassigned accesses attempt to acquire the Big QEMU Lock (BQL). For example, virtio-pci devices deadlock in virtio_write_config() -> virtio_pci_stop_ioeventfd() when waiting for the IOThread while holding the BQL. The IOThread is unable to acquire the BQL but the vcpu thread won't release the BQL while waiting for the IOThread. 
io_mem_unassigned is trivially thread-safe since it has no state, it simply rejects all load/store accesses. Therefore it is safe to enable lockless I/O on io_mem_unassigned to eliminate this deadlock. Here is the backtrace described above: Thread 9 (Thread 0x7fccfcdff6c0 (LWP 247832) "CPU 4/KVM"): #0 0x00007fcd11529d46 in ppoll () from target:/lib64/libc.so.6 #1 0x000056468a1a9bad in ppoll (__fds=, __nfds=, __timeout=0x0, __ss=0x0) at /usr/include/bits/poll2.h:88 #2 0x000056468a18f9d9 in fdmon_poll_wait (ctx=0x5646c6a1dc30, ready_list=0x7fccfcdfb310, timeout=-1) at ../util/fdmon-poll.c:79 #3 0x000056468a18f14f in aio_poll (ctx=, blocking=blocking@entry=true) at ../util/aio-posix.c:730 #4 0x000056468a1ad842 in aio_wait_bh_oneshot (ctx=, cb=cb@entry=0x564689faa420 , opaque=) at ../util/aio-wait.c:85 #5 0x0000564689faaa89 in virtio_blk_stop_ioeventfd (vdev=0x5646c8fd7e90) at ../hw/block/virtio-blk.c:1644 #6 0x0000564689d77880 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x5646c8fd7e08) at ../hw/virtio/virtio-bus.c:264 #7 0x0000564689d780db in virtio_bus_stop_ioeventfd (bus=bus@entry=0x5646c8fd7e08) at ../hw/virtio/virtio-bus.c:256 #8 0x0000564689d7d98a in virtio_pci_stop_ioeventfd (proxy=0x5646c8fcf8e0) at ../hw/virtio/virtio-pci.c:413 #9 virtio_write_config (pci_dev=0x5646c8fcf8e0, address=4, val=, len=) at ../hw/virtio/virtio-pci.c:803 #10 0x0000564689dcb45a in memory_region_write_accessor (mr=mr@entry=0x5646c6dc2d30, addr=3145732, value=value@entry=0x7fccfcdfb528, size=size@entry=2, shift=, mask=mask@entry=65535, attrs=...) at ../system/memory.c:491 #11 0x0000564689dcaeb0 in access_with_adjusted_size (addr=addr@entry=3145732, value=value@entry=0x7fccfcdfb528, size=size@entry=2, access_size_min=, access_size_max=, access_fn=0x564689dcb3f0 , mr=0x5646c6dc2d30, attrs=...) at ../system/memory.c:567 #12 0x0000564689dcb156 in memory_region_dispatch_write (mr=mr@entry=0x5646c6dc2d30, addr=addr@entry=3145732, data=, op=, attrs=attrs@entry=...) 
at ../system/memory.c:1554 #13 0x0000564689dd389a in flatview_write_continue_step (attrs=..., attrs@entry=..., buf=buf@entry=0x7fcd05b87028 "", mr_addr=3145732, l=l@entry=0x7fccfcdfb5f0, mr=0x5646c6dc2d30, len=2) at ../system/physmem.c:3266 #14 0x0000564689dd3adb in flatview_write_continue (fv=0x7fcadc0d8930, addr=3761242116, attrs=..., ptr=0xe0300004, len=2, mr_addr=, l=, mr=) at ../system/physmem.c:3296 #15 flatview_write (fv=0x7fcadc0d8930, addr=addr@entry=3761242116, attrs=attrs@entry=..., buf=buf@entry=0x7fcd05b87028, len=len@entry=2) at ../system/physmem.c:3327 #16 0x0000564689dd7191 in address_space_write (as=0x56468b433600 , addr=3761242116, attrs=..., buf=0x7fcd05b87028, len=2) at ../system/physmem.c:3447 #17 address_space_rw (as=0x56468b433600 , addr=3761242116, attrs=attrs@entry=..., buf=buf@entry=0x7fcd05b87028, len=2, is_write=) at ../system/physmem.c:3457 #18 0x0000564689ff1ef6 in kvm_cpu_exec (cpu=cpu@entry=0x5646c6dab810) at ../accel/kvm/kvm-all.c:3248 #19 0x0000564689ff32f5 in kvm_vcpu_thread_fn (arg=arg@entry=0x5646c6dab810) at ../accel/kvm/kvm-accel-ops.c:53 #20 0x000056468a19225c in qemu_thread_start (args=0x5646c6db6190) at ../util/qemu-thread-posix.c:393 #21 0x00007fcd114c5b68 in start_thread () from target:/lib64/libc.so.6 #22 0x00007fcd115364e4 in clone () from target:/lib64/libc.so.6 Thread 3 (Thread 0x7fcd0503a6c0 (LWP 247825) "IO iothread1"): #0 0x00007fcd114c2d30 in __lll_lock_wait () from target:/lib64/libc.so.6 #1 0x00007fcd114c8fe2 in pthread_mutex_lock@@GLIBC_2.2.5 () from target:/lib64/libc.so.6 #2 0x000056468a192538 in qemu_mutex_lock_impl (mutex=0x56468b432e60 , file=0x56468a1e26a5 "../system/physmem.c", line=3198) at ../util/qemu-thread-posix.c:94 #3 0x0000564689dc12e2 in bql_lock_impl (file=file@entry=0x56468a1e26a5 "../system/physmem.c", line=line@entry=3198) at ../system/cpus.c:566 #4 0x0000564689ddc151 in prepare_mmio_access (mr=0x56468b433800 ) at ../system/physmem.c:3198 #5 address_space_lduw_internal_cached_slow (cache=, 
addr=2, attrs=..., result=0x0, endian=DEVICE_LITTLE_ENDIAN) at ../system/memory_ldst.c.inc:211 #6 address_space_lduw_le_cached_slow (cache=, addr=addr@entry=2, attrs=attrs@entry=..., result=result@entry=0x0) at ../system/memory_ldst.c.inc:253 #7 0x0000564689fd692c in address_space_lduw_le_cached (result=0x0, cache=, addr=2, attrs=...) at /var/tmp/qemu/include/exec/memory_ldst_cached.h.inc:35 #8 lduw_le_phys_cached (cache=, addr=2) at /var/tmp/qemu/include/exec/memory_ldst_phys.h.inc:66 #9 virtio_lduw_phys_cached (vdev=, cache=, pa=2) at /var/tmp/qemu/include/hw/virtio/virtio-access.h:166 #10 vring_avail_idx (vq=0x5646c8fe2470) at ../hw/virtio/virtio.c:396 #11 virtio_queue_split_set_notification (vq=0x5646c8fe2470, enable=0) at ../hw/virtio/virtio.c:534 #12 virtio_queue_set_notification (vq=0x5646c8fe2470, enable=0) at ../hw/virtio/virtio.c:595 #13 0x000056468a18e7a8 in poll_set_started (ctx=ctx@entry=0x5646c6c74e30, ready_list=ready_list@entry=0x7fcd050366a0, started=started@entry=true) at ../util/aio-posix.c:247 #14 0x000056468a18f2bb in poll_set_started (ctx=0x5646c6c74e30, ready_list=0x7fcd050366a0, started=true) at ../util/aio-posix.c:226 #15 try_poll_mode (ctx=0x5646c6c74e30, ready_list=0x7fcd050366a0, timeout=) at ../util/aio-posix.c:612 #16 aio_poll (ctx=0x5646c6c74e30, blocking=blocking@entry=true) at ../util/aio-posix.c:689 #17 0x000056468a032c26 in iothread_run (opaque=opaque@entry=0x5646c69f3380) at ../iothread.c:63 #18 0x000056468a19225c in qemu_thread_start (args=0x5646c6c75410) at ../util/qemu-thread-posix.c:393 #19 0x00007fcd114c5b68 in start_thread () from target:/lib64/libc.so.6 #20 0x00007fcd115364e4 in clone () from target:/lib64/libc.so.6 Buglink: https://issues.redhat.com/browse/RHEL-71933 Reported-by: Peixiu Hou Cc: Kevin Wolf Cc: Paolo Bonzini Signed-off-by: Stefan Hajnoczi Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20251029185224.420261-1-stefanha@redhat.com Signed-off-by: Peter Xu --- system/physmem.c | 3 +++ 1 file 
changed, 3 insertions(+) diff --git a/system/physmem.c b/system/physmem.c index a7e2a5d07f..c9869e4049 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -3011,6 +3011,9 @@ static void io_mem_init(void) { memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL, NULL, UINT64_MAX); + + /* Trivially thread-safe since memory accesses are rejected */ + memory_region_enable_lockless_io(&io_mem_unassigned); } AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv) From 1529ec8f5fed0fdeb686c3f235f1b87ff116512e Mon Sep 17 00:00:00 2001 From: Juraj Marcin Date: Mon, 3 Nov 2025 19:32:50 +0100 Subject: [PATCH 29/36] migration: Flush migration channel after sending data of CMD_PACKAGED If the length of the data sent after CMD_PACKAGED is just right, and there is not much data to send afterward, it is possible part of the CMD_PACKAGED payload will get left behind in the sending buffer. This causes the destination side to hang while it tries to load the whole package and initiate postcopy. Signed-off-by: Juraj Marcin Link: https://lore.kernel.org/r/20251103183301.3840862-2-jmarcin@redhat.com Signed-off-by: Peter Xu --- migration/savevm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/migration/savevm.c b/migration/savevm.c index 232cae090b..fa017378db 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1142,6 +1142,7 @@ int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len) qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp); qemu_put_buffer(f, buf, len); + qemu_fflush(f); return 0; } From 26f65c01edcf046d0dcfaccaa24eb62584510e44 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 3 Nov 2025 19:32:51 +0100 Subject: [PATCH 30/36] migration: Do not try to start VM if disk activation fails If a rare split brain happens (e.g. dest QEMU started running somehow, taking shared drive locks), src QEMU may not be able to activate the drives anymore. 
In this case, src QEMU shouldn't start the VM or it might crash the block layer later with something like: Meanwhile, src QEMU cannot try to continue either even if dest QEMU can release the drive locks (e.g. by QMP "stop"). Because as long as dest QEMU started running, it means dest QEMU's RAM is the only version that is consistent with current status of the shared storage. Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20251103183301.3840862-3-jmarcin@redhat.com Signed-off-by: Peter Xu --- migration/migration.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 5e74993b46..6e647c7c4a 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3526,6 +3526,8 @@ static MigIterateState migration_iteration_run(MigrationState *s) static void migration_iteration_finish(MigrationState *s) { + Error *local_err = NULL; + bql_lock(); /* @@ -3549,11 +3551,28 @@ static void migration_iteration_finish(MigrationState *s) case MIGRATION_STATUS_FAILED: case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_CANCELLING: - /* - * Re-activate the block drives if they're inactivated. Note, COLO - * shouldn't use block_active at all, so it should be no-op there. - */ - migration_block_activate(NULL); + if (!migration_block_activate(&local_err)) { + /* + * Re-activate the block drives if they're inactivated. + * + * If it fails (e.g. in case of a split brain, where dest QEMU + * might have taken some of the drive locks and running!), do + * not start VM, instead wait for mgmt to decide the next step. + * + * If dest already started, it means dest QEMU should contain + * all the data it needs and it properly owns all the drive + * locks. Then even if src QEMU got a FAILED in migration, it + * normally should mean we should treat the migration as + * COMPLETED. + * + * NOTE: it's not safe anymore to start VM on src now even if + * dest would release the drive locks. 
It's because as long as + * dest started running then only dest QEMU's RAM is consistent + * with the shared storage. + */ + error_free(local_err); + break; + } if (runstate_is_live(s->vm_old_state)) { if (!runstate_check(RUN_STATE_SHUTDOWN)) { vm_start(); From c9aac1ae106e58be1c4f714793ee9170f91c5e7d Mon Sep 17 00:00:00 2001 From: Juraj Marcin Date: Mon, 3 Nov 2025 19:32:52 +0100 Subject: [PATCH 31/36] migration: Move postcopy_ram_listen_thread() to postcopy-ram.c This patch addresses a TODO about moving postcopy_ram_listen_thread() to postcopy file. Signed-off-by: Juraj Marcin Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20251103183301.3840862-4-jmarcin@redhat.com Signed-off-by: Peter Xu --- migration/postcopy-ram.c | 105 ++++++++++++++++++++++++++++++++++++++ migration/postcopy-ram.h | 2 + migration/savevm.c | 107 --------------------------------------- 3 files changed, 107 insertions(+), 107 deletions(-) diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 5471efb4f0..880b11f154 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -2077,3 +2077,108 @@ bool postcopy_is_paused(MigrationStatus status) return status == MIGRATION_STATUS_POSTCOPY_PAUSED || status == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP; } + +/* + * Triggered by a postcopy_listen command; this thread takes over reading + * the input stream, leaving the main thread free to carry on loading the rest + * of the device state (from RAM). 
+ */ +void *postcopy_ram_listen_thread(void *opaque) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + QEMUFile *f = mis->from_src_file; + int load_res; + MigrationState *migr = migrate_get_current(); + Error *local_err = NULL; + + object_ref(OBJECT(migr)); + + migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + qemu_event_set(&mis->thread_sync_event); + trace_postcopy_ram_listen_thread_start(); + + rcu_register_thread(); + /* + * Because we're a thread and not a coroutine we can't yield + * in qemu_file, and thus we must be blocking now. + */ + qemu_file_set_blocking(f, true, &error_fatal); + + /* TODO: sanity check that only postcopiable data will be loaded here */ + load_res = qemu_loadvm_state_main(f, mis, &local_err); + + /* + * This is tricky, but, mis->from_src_file can change after it + * returns, when postcopy recovery happened. In the future, we may + * want a wrapper for the QEMUFile handle. + */ + f = mis->from_src_file; + + /* And non-blocking again so we don't block in any cleanup */ + qemu_file_set_blocking(f, false, &error_fatal); + + trace_postcopy_ram_listen_thread_exit(); + if (load_res < 0) { + qemu_file_set_error(f, load_res); + dirty_bitmap_mig_cancel_incoming(); + if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING && + !migrate_postcopy_ram() && migrate_dirty_bitmaps()) + { + error_report("%s: loadvm failed during postcopy: %d: %s. All states " + "are migrated except dirty bitmaps. 
Some dirty " + "bitmaps may be lost, and present migrated dirty " + "bitmaps are correctly migrated and valid.", + __func__, load_res, error_get_pretty(local_err)); + g_clear_pointer(&local_err, error_free); + load_res = 0; /* prevent further exit() */ + } else { + error_prepend(&local_err, + "loadvm failed during postcopy: %d: ", load_res); + migrate_set_error(migr, local_err); + g_clear_pointer(&local_err, error_report_err); + migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, + MIGRATION_STATUS_FAILED); + } + } + if (load_res >= 0) { + /* + * This looks good, but it's possible that the device loading in the + * main thread hasn't finished yet, and so we might not be in 'RUN' + * state yet; wait for the end of the main thread. + */ + qemu_event_wait(&mis->main_thread_load_event); + } + postcopy_ram_incoming_cleanup(mis); + + if (load_res < 0) { + /* + * If something went wrong then we have a bad state so exit; + * depending how far we got it might be possible at this point + * to leave the guest running and fire MCEs for pages that never + * arrived as a desperate recovery step. + */ + rcu_unregister_thread(); + exit(EXIT_FAILURE); + } + + migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, + MIGRATION_STATUS_COMPLETED); + /* + * If everything has worked fine, then the main thread has waited + * for us to start, and we're the last use of the mis. + * (If something broke then qemu will have to exit anyway since it's + * got a bad migration state). 
+ */ + bql_lock(); + migration_incoming_state_destroy(); + bql_unlock(); + + rcu_unregister_thread(); + mis->have_listen_thread = false; + postcopy_state_set(POSTCOPY_INCOMING_END); + + object_unref(OBJECT(migr)); + + return NULL; +} diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h index ca19433b24..3e26db3e6b 100644 --- a/migration/postcopy-ram.h +++ b/migration/postcopy-ram.h @@ -199,4 +199,6 @@ bool postcopy_is_paused(MigrationStatus status); void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid, RAMBlock *rb); +void *postcopy_ram_listen_thread(void *opaque); + #endif diff --git a/migration/savevm.c b/migration/savevm.c index fa017378db..2f7ed0db64 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2088,113 +2088,6 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, return 0; } -/* - * Triggered by a postcopy_listen command; this thread takes over reading - * the input stream, leaving the main thread free to carry on loading the rest - * of the device state (from RAM). - * (TODO:This could do with being in a postcopy file - but there again it's - * just another input loop, not that postcopy specific) - */ -static void *postcopy_ram_listen_thread(void *opaque) -{ - MigrationIncomingState *mis = migration_incoming_get_current(); - QEMUFile *f = mis->from_src_file; - int load_res; - MigrationState *migr = migrate_get_current(); - Error *local_err = NULL; - - object_ref(OBJECT(migr)); - - migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_POSTCOPY_ACTIVE); - qemu_event_set(&mis->thread_sync_event); - trace_postcopy_ram_listen_thread_start(); - - rcu_register_thread(); - /* - * Because we're a thread and not a coroutine we can't yield - * in qemu_file, and thus we must be blocking now. 
- */ - qemu_file_set_blocking(f, true, &error_fatal); - - /* TODO: sanity check that only postcopiable data will be loaded here */ - load_res = qemu_loadvm_state_main(f, mis, &local_err); - - /* - * This is tricky, but, mis->from_src_file can change after it - * returns, when postcopy recovery happened. In the future, we may - * want a wrapper for the QEMUFile handle. - */ - f = mis->from_src_file; - - /* And non-blocking again so we don't block in any cleanup */ - qemu_file_set_blocking(f, false, &error_fatal); - - trace_postcopy_ram_listen_thread_exit(); - if (load_res < 0) { - qemu_file_set_error(f, load_res); - dirty_bitmap_mig_cancel_incoming(); - if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING && - !migrate_postcopy_ram() && migrate_dirty_bitmaps()) - { - error_report("%s: loadvm failed during postcopy: %d: %s. All states " - "are migrated except dirty bitmaps. Some dirty " - "bitmaps may be lost, and present migrated dirty " - "bitmaps are correctly migrated and valid.", - __func__, load_res, error_get_pretty(local_err)); - g_clear_pointer(&local_err, error_free); - load_res = 0; /* prevent further exit() */ - } else { - error_prepend(&local_err, - "loadvm failed during postcopy: %d: ", load_res); - migrate_set_error(migr, local_err); - g_clear_pointer(&local_err, error_report_err); - migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, - MIGRATION_STATUS_FAILED); - } - } - if (load_res >= 0) { - /* - * This looks good, but it's possible that the device loading in the - * main thread hasn't finished yet, and so we might not be in 'RUN' - * state yet; wait for the end of the main thread. - */ - qemu_event_wait(&mis->main_thread_load_event); - } - postcopy_ram_incoming_cleanup(mis); - - if (load_res < 0) { - /* - * If something went wrong then we have a bad state so exit; - * depending how far we got it might be possible at this point - * to leave the guest running and fire MCEs for pages that never - * arrived as a desperate recovery step. 
- */ - rcu_unregister_thread(); - exit(EXIT_FAILURE); - } - - migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, - MIGRATION_STATUS_COMPLETED); - /* - * If everything has worked fine, then the main thread has waited - * for us to start, and we're the last use of the mis. - * (If something broke then qemu will have to exit anyway since it's - * got a bad migration state). - */ - bql_lock(); - migration_incoming_state_destroy(); - bql_unlock(); - - rcu_unregister_thread(); - mis->have_listen_thread = false; - postcopy_state_set(POSTCOPY_INCOMING_END); - - object_unref(OBJECT(migr)); - - return NULL; -} - /* After this message we must be able to immediately receive postcopy data */ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis, Error **errp) From 67474ebacce381e51fa3e9c71af37b68faf4860a Mon Sep 17 00:00:00 2001 From: Juraj Marcin Date: Mon, 3 Nov 2025 19:32:53 +0100 Subject: [PATCH 32/36] migration: Introduce postcopy incoming setup and cleanup functions After moving postcopy_ram_listen_thread() to postcopy file, this patch introduces a pair of functions, postcopy_incoming_setup() and postcopy_incoming_cleanup(). These functions encapsulate setup and cleanup of all incoming postcopy resources, postcopy-ram and postcopy listen thread. Furthermore, this patch also renames the postcopy_ram_listen_thread to postcopy_listen_thread, as this thread handles not only postcopy-ram, but also dirty-bitmaps and in the future it could handle other postcopiable devices. 
Signed-off-by: Juraj Marcin Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20251103183301.3840862-5-jmarcin@redhat.com Signed-off-by: Peter Xu --- migration/migration.c | 2 +- migration/postcopy-ram.c | 44 ++++++++++++++++++++++++++++++++++++++-- migration/postcopy-ram.h | 3 ++- migration/savevm.c | 25 ++--------------------- 4 files changed, 47 insertions(+), 27 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 6e647c7c4a..9a367f717e 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -892,7 +892,7 @@ process_incoming_migration_co(void *opaque) * but managed to complete within the precopy period, we can use * the normal exit. */ - postcopy_ram_incoming_cleanup(mis); + postcopy_incoming_cleanup(mis); } else if (ret >= 0) { /* * Postcopy was started, cleanup should happen at the end of the diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 880b11f154..b47c955763 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -2083,7 +2083,7 @@ bool postcopy_is_paused(MigrationStatus status) * the input stream, leaving the main thread free to carry on loading the rest * of the device state (from RAM). 
*/ -void *postcopy_ram_listen_thread(void *opaque) +static void *postcopy_listen_thread(void *opaque) { MigrationIncomingState *mis = migration_incoming_get_current(); QEMUFile *f = mis->from_src_file; @@ -2149,7 +2149,7 @@ void *postcopy_ram_listen_thread(void *opaque) */ qemu_event_wait(&mis->main_thread_load_event); } - postcopy_ram_incoming_cleanup(mis); + postcopy_incoming_cleanup(mis); if (load_res < 0) { /* @@ -2182,3 +2182,43 @@ void *postcopy_ram_listen_thread(void *opaque) return NULL; } + +int postcopy_incoming_setup(MigrationIncomingState *mis, Error **errp) +{ + /* + * Sensitise RAM - can now generate requests for blocks that don't exist + * However, at this point the CPU shouldn't be running, and the IO + * shouldn't be doing anything yet so don't actually expect requests + */ + if (migrate_postcopy_ram()) { + if (postcopy_ram_incoming_setup(mis)) { + postcopy_ram_incoming_cleanup(mis); + error_setg(errp, "Failed to setup incoming postcopy RAM blocks"); + return -1; + } + } + + trace_loadvm_postcopy_handle_listen("after uffd"); + + if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, errp)) { + return -1; + } + + mis->have_listen_thread = true; + postcopy_thread_create(mis, &mis->listen_thread, + MIGRATION_THREAD_DST_LISTEN, + postcopy_listen_thread, QEMU_THREAD_DETACHED); + + return 0; +} + +int postcopy_incoming_cleanup(MigrationIncomingState *mis) +{ + int rc = 0; + + if (migrate_postcopy_ram()) { + rc = postcopy_ram_incoming_cleanup(mis); + } + + return rc; +} diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h index 3e26db3e6b..a080dd65a7 100644 --- a/migration/postcopy-ram.h +++ b/migration/postcopy-ram.h @@ -199,6 +199,7 @@ bool postcopy_is_paused(MigrationStatus status); void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid, RAMBlock *rb); -void *postcopy_ram_listen_thread(void *opaque); +int postcopy_incoming_setup(MigrationIncomingState *mis, Error **errp); +int postcopy_incoming_cleanup(MigrationIncomingState *mis); 
#endif diff --git a/migration/savevm.c b/migration/savevm.c index 2f7ed0db64..01b5a8bfff 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2113,32 +2113,11 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis, trace_loadvm_postcopy_handle_listen("after discard"); - /* - * Sensitise RAM - can now generate requests for blocks that don't exist - * However, at this point the CPU shouldn't be running, and the IO - * shouldn't be doing anything yet so don't actually expect requests - */ - if (migrate_postcopy_ram()) { - if (postcopy_ram_incoming_setup(mis)) { - postcopy_ram_incoming_cleanup(mis); - error_setg(errp, "Failed to setup incoming postcopy RAM blocks"); - return -1; - } - } + int rc = postcopy_incoming_setup(mis, errp); - trace_loadvm_postcopy_handle_listen("after uffd"); - - if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, errp)) { - return -1; - } - - mis->have_listen_thread = true; - postcopy_thread_create(mis, &mis->listen_thread, - MIGRATION_THREAD_DST_LISTEN, - postcopy_ram_listen_thread, QEMU_THREAD_DETACHED); trace_loadvm_postcopy_handle_listen("return"); - return 0; + return rc; } static void loadvm_postcopy_handle_run_bh(void *opaque) From 468a7effdefba22ac905297b2aa02b5781587f40 Mon Sep 17 00:00:00 2001 From: Juraj Marcin Date: Mon, 3 Nov 2025 19:32:54 +0100 Subject: [PATCH 33/36] migration: Refactor all incoming cleanup into migration_incoming_destroy() Currently, there are two functions that are responsible for calling the cleanup of the incoming migration state. With successful precopy, it's the incoming migration coroutine, and with successful postcopy it's the postcopy listen thread. However, if postcopy fails during the device load, both functions will try to do the cleanup. This patch refactors all cleanup that needs to be done on the incoming side into a common function and defines a clear boundary of who is responsible for the cleanup. 
The incoming migration coroutine is responsible for calling the cleanup function, unless the listen thread has been started, in which case the postcopy listen thread runs the incoming migration cleanup in its BH. Signed-off-by: Juraj Marcin Fixes: 9535435795 ("migration: push Error **errp into qemu_loadvm_state()") Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20251103183301.3840862-6-jmarcin@redhat.com Signed-off-by: Peter Xu --- migration/migration.c | 44 +++++++++------------------- migration/migration.h | 1 + migration/postcopy-ram.c | 63 +++++++++++++++++++++------------------- migration/trace-events | 2 +- 4 files changed, 49 insertions(+), 61 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 9a367f717e..637be71bfe 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -438,10 +438,15 @@ void migration_incoming_transport_cleanup(MigrationIncomingState *mis) void migration_incoming_state_destroy(void) { - struct MigrationIncomingState *mis = migration_incoming_get_current(); + MigrationIncomingState *mis = migration_incoming_get_current(); + PostcopyState ps = postcopy_state_get(); multifd_recv_cleanup(); + if (ps != POSTCOPY_INCOMING_NONE) { + postcopy_incoming_cleanup(mis); + } + /* * RAM state cleanup needs to happen after multifd cleanup, because * multifd threads can use some of its states (receivedmap). 
@@ -866,7 +871,6 @@ process_incoming_migration_co(void *opaque) { MigrationState *s = migrate_get_current(); MigrationIncomingState *mis = migration_incoming_get_current(); - PostcopyState ps; int ret; Error *local_err = NULL; @@ -883,25 +887,14 @@ process_incoming_migration_co(void *opaque) trace_vmstate_downtime_checkpoint("dst-precopy-loadvm-completed"); - ps = postcopy_state_get(); - trace_process_incoming_migration_co_end(ret, ps); - if (ps != POSTCOPY_INCOMING_NONE) { - if (ps == POSTCOPY_INCOMING_ADVISE) { - /* - * Where a migration had postcopy enabled (and thus went to advise) - * but managed to complete within the precopy period, we can use - * the normal exit. - */ - postcopy_incoming_cleanup(mis); - } else if (ret >= 0) { - /* - * Postcopy was started, cleanup should happen at the end of the - * postcopy thread. - */ - trace_process_incoming_migration_co_postcopy_end_main(); - goto out; - } - /* Else if something went wrong then just fall out of the normal exit */ + trace_process_incoming_migration_co_end(ret); + if (mis->have_listen_thread) { + /* + * Postcopy was started, cleanup should happen at the end of the + * postcopy listen thread. + */ + trace_process_incoming_migration_co_postcopy_end_main(); + goto out; } if (ret < 0) { @@ -933,15 +926,6 @@ fail: } exit(EXIT_FAILURE); - } else { - /* - * Report the error here in case that QEMU abruptly exits - * when postcopy is enabled. 
- */ - WITH_QEMU_LOCK_GUARD(&s->error_mutex) { - error_report_err(s->error); - s->error = NULL; - } } out: /* Pairs with the refcount taken in qmp_migrate_incoming() */ diff --git a/migration/migration.h b/migration/migration.h index 01329bf824..4a37f7202c 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -254,6 +254,7 @@ struct MigrationIncomingState { MigrationIncomingState *migration_incoming_get_current(void); void migration_incoming_state_destroy(void); void migration_incoming_transport_cleanup(MigrationIncomingState *mis); +void migration_incoming_qemu_exit(void); /* * Functions to work with blocktime context */ diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index b47c955763..48cbb46c27 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -2078,6 +2078,24 @@ bool postcopy_is_paused(MigrationStatus status) status == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP; } +static void postcopy_listen_thread_bh(void *opaque) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + + migration_incoming_state_destroy(); + + if (mis->state == MIGRATION_STATUS_FAILED) { + /* + * If something went wrong then we have a bad state so exit; + * we only could have gotten here if something failed before + * POSTCOPY_INCOMING_RUNNING (for example device load), otherwise + * postcopy migration would pause inside qemu_loadvm_state_main(). + * Failing dirty-bitmaps won't fail the whole migration. 
+ */ + exit(1); + } +} + /* * Triggered by a postcopy_listen command; this thread takes over reading * the input stream, leaving the main thread free to carry on loading the rest @@ -2131,53 +2149,38 @@ static void *postcopy_listen_thread(void *opaque) "bitmaps are correctly migrated and valid.", __func__, load_res, error_get_pretty(local_err)); g_clear_pointer(&local_err, error_free); - load_res = 0; /* prevent further exit() */ } else { + /* + * Something went fatally wrong and we have a bad state, QEMU will + * exit depending on if postcopy-exit-on-error is true, but the + * migration cannot be recovered. + */ error_prepend(&local_err, "loadvm failed during postcopy: %d: ", load_res); migrate_set_error(migr, local_err); g_clear_pointer(&local_err, error_report_err); migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, MIGRATION_STATUS_FAILED); + goto out; } } - if (load_res >= 0) { - /* - * This looks good, but it's possible that the device loading in the - * main thread hasn't finished yet, and so we might not be in 'RUN' - * state yet; wait for the end of the main thread. - */ - qemu_event_wait(&mis->main_thread_load_event); - } - postcopy_incoming_cleanup(mis); - - if (load_res < 0) { - /* - * If something went wrong then we have a bad state so exit; - * depending how far we got it might be possible at this point - * to leave the guest running and fire MCEs for pages that never - * arrived as a desperate recovery step. - */ - rcu_unregister_thread(); - exit(EXIT_FAILURE); - } + /* + * This looks good, but it's possible that the device loading in the + * main thread hasn't finished yet, and so we might not be in 'RUN' + * state yet; wait for the end of the main thread. + */ + qemu_event_wait(&mis->main_thread_load_event); migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, MIGRATION_STATUS_COMPLETED); - /* - * If everything has worked fine, then the main thread has waited - * for us to start, and we're the last use of the mis. 
- * (If something broke then qemu will have to exit anyway since it's - * got a bad migration state). - */ - bql_lock(); - migration_incoming_state_destroy(); - bql_unlock(); +out: rcu_unregister_thread(); mis->have_listen_thread = false; postcopy_state_set(POSTCOPY_INCOMING_END); + migration_bh_schedule(postcopy_listen_thread_bh, NULL); + object_unref(OBJECT(migr)); return NULL; diff --git a/migration/trace-events b/migration/trace-events index e8edd1fbba..772636f3ac 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -193,7 +193,7 @@ source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32 source_return_path_thread_switchover_acked(void) "" migration_thread_low_pending(uint64_t pending) "%" PRIu64 migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 " max_size %" PRId64 -process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d" +process_incoming_migration_co_end(int ret) "ret=%d" process_incoming_migration_co_postcopy_end_main(void) "" postcopy_preempt_enabled(bool value) "%d" migration_precopy_complete(void) "" From b1a17a519b20438fb3f8435a2b7fa702f393d075 Mon Sep 17 00:00:00 2001 From: Juraj Marcin Date: Mon, 3 Nov 2025 19:32:55 +0100 Subject: [PATCH 34/36] migration: Respect exit-on-error when migration fails before resuming When exit-on-error was added to migration, it wasn't added to postcopy. Even though postcopy migration will usually pause and not fail, in cases it does unrecoverably fail before destination side has been started, exit-on-error will allow management to query the error. 
Signed-off-by: Juraj Marcin Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20251103183301.3840862-7-jmarcin@redhat.com Signed-off-by: Peter Xu --- migration/postcopy-ram.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 48cbb46c27..91431f02a4 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -2080,11 +2080,16 @@ bool postcopy_is_paused(MigrationStatus status) static void postcopy_listen_thread_bh(void *opaque) { + MigrationState *s = migrate_get_current(); MigrationIncomingState *mis = migration_incoming_get_current(); migration_incoming_state_destroy(); - if (mis->state == MIGRATION_STATUS_FAILED) { + if (mis->state == MIGRATION_STATUS_FAILED && mis->exit_on_error) { + WITH_QEMU_LOCK_GUARD(&s->error_mutex) { + error_report_err(s->error); + s->error = NULL; + } /* * If something went wrong then we have a bad state so exit; * we only could have gotten here if something failed before From 0680dd185b3265a948e1863cdc52c65f72b994d9 Mon Sep 17 00:00:00 2001 From: Juraj Marcin Date: Mon, 3 Nov 2025 19:32:56 +0100 Subject: [PATCH 35/36] migration: Make postcopy listen thread joinable This patch makes the listen thread joinable instead of detached, and joins it alongside other postcopy threads. 
Signed-off-by: Juraj Marcin Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20251103183301.3840862-8-jmarcin@redhat.com Signed-off-by: Peter Xu --- migration/postcopy-ram.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 91431f02a4..8405cce7b4 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -2181,7 +2181,6 @@ static void *postcopy_listen_thread(void *opaque) out: rcu_unregister_thread(); - mis->have_listen_thread = false; postcopy_state_set(POSTCOPY_INCOMING_END); migration_bh_schedule(postcopy_listen_thread_bh, NULL); @@ -2215,7 +2214,7 @@ int postcopy_incoming_setup(MigrationIncomingState *mis, Error **errp) mis->have_listen_thread = true; postcopy_thread_create(mis, &mis->listen_thread, MIGRATION_THREAD_DST_LISTEN, - postcopy_listen_thread, QEMU_THREAD_DETACHED); + postcopy_listen_thread, QEMU_THREAD_JOINABLE); return 0; } @@ -2224,6 +2223,11 @@ int postcopy_incoming_cleanup(MigrationIncomingState *mis) { int rc = 0; + if (mis->have_listen_thread) { + qemu_thread_join(&mis->listen_thread); + mis->have_listen_thread = false; + } + if (migrate_postcopy_ram()) { rc = postcopy_ram_incoming_cleanup(mis); } From 7b842fe354c63feaffc63c850b28c3610a0c90d2 Mon Sep 17 00:00:00 2001 From: Juraj Marcin Date: Mon, 3 Nov 2025 19:32:57 +0100 Subject: [PATCH 36/36] migration: Introduce POSTCOPY_DEVICE state Currently, when postcopy starts, the source VM starts switchover and sends a package containing the state of all non-postcopiable devices. When the destination loads this package, the switchover is complete and the destination VM starts. However, if the device state load fails or the destination side crashes, the source side is already in POSTCOPY_ACTIVE state and cannot be recovered, even when it has the most up-to-date machine state as the destination has not yet started. 
This patch introduces a new POSTCOPY_DEVICE state which is active while the destination machine is loading the device state, is not yet running, and the source side can be resumed in case of a migration failure. Return-path is required for this state to function, otherwise it will be skipped in favor of POSTCOPY_ACTIVE. To transition from POSTCOPY_DEVICE to POSTCOPY_ACTIVE, the source side uses a PONG message that is a response to a PING message processed just before the POSTCOPY_RUN command that starts the destination VM. Thus, this feature is effective even if the destination side does not yet support this new state. Signed-off-by: Juraj Marcin Link: https://lore.kernel.org/r/20251103183301.3840862-9-jmarcin@redhat.com Signed-off-by: Peter Xu --- migration/migration.c | 50 ++++++++++++++++++++++++--- migration/migration.h | 3 ++ migration/postcopy-ram.c | 10 ++++-- migration/savevm.c | 5 +++ migration/savevm.h | 2 ++ migration/trace-events | 1 + qapi/migration.json | 10 ++++-- tests/qemu-iotests/194 | 2 +- tests/qtest/migration/precopy-tests.c | 3 +- 9 files changed, 75 insertions(+), 11 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 637be71bfe..c2daab6bdd 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1206,6 +1206,7 @@ bool migration_is_running(void) switch (s->state) { case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_DEVICE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_POSTCOPY_PAUSED: case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: @@ -1227,6 +1228,7 @@ static bool migration_is_active(void) MigrationState *s = current_migration; return (s->state == MIGRATION_STATUS_ACTIVE || + s->state == MIGRATION_STATUS_POSTCOPY_DEVICE || s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); } @@ -1349,6 +1351,7 @@ static void fill_source_migration_info(MigrationInfo *info) break; case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_CANCELLING: + case MIGRATION_STATUS_POSTCOPY_DEVICE: case 
MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_PRE_SWITCHOVER: case MIGRATION_STATUS_DEVICE: @@ -1402,6 +1405,7 @@ static void fill_destination_migration_info(MigrationInfo *info) case MIGRATION_STATUS_CANCELLING: case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_DEVICE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_POSTCOPY_PAUSED: case MIGRATION_STATUS_POSTCOPY_RECOVER: @@ -1732,6 +1736,7 @@ bool migration_in_postcopy(void) MigrationState *s = migrate_get_current(); switch (s->state) { + case MIGRATION_STATUS_POSTCOPY_DEVICE: case MIGRATION_STATUS_POSTCOPY_ACTIVE: case MIGRATION_STATUS_POSTCOPY_PAUSED: case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: @@ -1833,6 +1838,9 @@ int migrate_init(MigrationState *s, Error **errp) memset(&mig_stats, 0, sizeof(mig_stats)); migration_reset_vfio_bytes_transferred(); + s->postcopy_package_loaded = false; + qemu_event_reset(&s->postcopy_package_loaded_event); + return 0; } @@ -2568,6 +2576,11 @@ static void *source_return_path_thread(void *opaque) tmp32 = ldl_be_p(buf); trace_source_return_path_thread_pong(tmp32); qemu_sem_post(&ms->rp_state.rp_pong_acks); + if (tmp32 == QEMU_VM_PING_PACKAGED_LOADED) { + trace_source_return_path_thread_postcopy_package_loaded(); + ms->postcopy_package_loaded = true; + qemu_event_set(&ms->postcopy_package_loaded_event); + } break; case MIG_RP_MSG_REQ_PAGES: @@ -2813,6 +2826,15 @@ static int postcopy_start(MigrationState *ms, Error **errp) if (migrate_postcopy_ram()) { qemu_savevm_send_ping(fb, 3); } + if (ms->rp_state.rp_thread_created) { + /* + * This ping will tell us that all non-postcopiable device state has been + * successfully loaded and the destination is about to start. When + * response is received, it will trigger transition from POSTCOPY_DEVICE + * to POSTCOPY_ACTIVE state. 
+ */ + qemu_savevm_send_ping(fb, QEMU_VM_PING_PACKAGED_LOADED); + } qemu_savevm_send_postcopy_run(fb); @@ -2868,8 +2890,13 @@ static int postcopy_start(MigrationState *ms, Error **errp) */ migration_rate_set(migrate_max_postcopy_bandwidth()); - /* Now, switchover looks all fine, switching to postcopy-active */ + /* + * Now, switchover looks all fine, switching to POSTCOPY_DEVICE, or + * directly to POSTCOPY_ACTIVE if there is no return path. + */ migrate_set_state(&ms->state, MIGRATION_STATUS_DEVICE, + ms->rp_state.rp_thread_created ? + MIGRATION_STATUS_POSTCOPY_DEVICE : MIGRATION_STATUS_POSTCOPY_ACTIVE); bql_unlock(); @@ -3311,8 +3338,8 @@ static MigThrError migration_detect_error(MigrationState *s) return postcopy_pause(s); } else { /* - * For precopy (or postcopy with error outside IO), we fail - * with no time. + * For precopy (or postcopy with error outside IO, or before dest + * starts), we fail with no time. */ migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); trace_migration_thread_file_err(); @@ -3447,7 +3474,8 @@ static MigIterateState migration_iteration_run(MigrationState *s) { uint64_t must_precopy, can_postcopy, pending_size; Error *local_err = NULL; - bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE; + bool in_postcopy = (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE || + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); bool can_switchover = migration_can_switchover(s); bool complete_ready; @@ -3463,6 +3491,18 @@ static MigIterateState migration_iteration_run(MigrationState *s) * POSTCOPY_ACTIVE it means switchover already happened. */ complete_ready = !pending_size; + if (s->state == MIGRATION_STATUS_POSTCOPY_DEVICE && + (s->postcopy_package_loaded || complete_ready)) { + /* + * If package has been loaded, the event is set and we will + * immediately transition to POSTCOPY_ACTIVE. If we are ready for + * completion, we need to wait for destination to load the postcopy + * package before actually completing. 
+ */ + qemu_event_wait(&s->postcopy_package_loaded_event); + migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_DEVICE, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + } } else { /* * Exact pending reporting is only needed for precopy. Taking RAM @@ -4117,6 +4157,7 @@ static void migration_instance_finalize(Object *obj) qemu_sem_destroy(&ms->rp_state.rp_pong_acks); qemu_sem_destroy(&ms->postcopy_qemufile_src_sem); error_free(ms->error); + qemu_event_destroy(&ms->postcopy_package_loaded_event); } static void migration_instance_init(Object *obj) @@ -4138,6 +4179,7 @@ static void migration_instance_init(Object *obj) qemu_sem_init(&ms->wait_unplug_sem, 0); qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0); qemu_mutex_init(&ms->qemu_file_lock); + qemu_event_init(&ms->postcopy_package_loaded_event, 0); } /* diff --git a/migration/migration.h b/migration/migration.h index 4a37f7202c..213b33fe6e 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -510,6 +510,9 @@ struct MigrationState { /* Is this a rdma migration */ bool rdma_migration; + bool postcopy_package_loaded; + QemuEvent postcopy_package_loaded_event; + GSource *hup_source; }; diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 8405cce7b4..3f98dcb6fd 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -2117,7 +2117,8 @@ static void *postcopy_listen_thread(void *opaque) object_ref(OBJECT(migr)); migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_POSTCOPY_ACTIVE); + mis->to_src_file ? 
MIGRATION_STATUS_POSTCOPY_DEVICE : + MIGRATION_STATUS_POSTCOPY_ACTIVE); qemu_event_set(&mis->thread_sync_event); trace_postcopy_ram_listen_thread_start(); @@ -2164,8 +2165,7 @@ static void *postcopy_listen_thread(void *opaque) "loadvm failed during postcopy: %d: ", load_res); migrate_set_error(migr, local_err); g_clear_pointer(&local_err, error_report_err); - migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, - MIGRATION_STATUS_FAILED); + migrate_set_state(&mis->state, mis->state, MIGRATION_STATUS_FAILED); goto out; } } @@ -2176,6 +2176,10 @@ static void *postcopy_listen_thread(void *opaque) */ qemu_event_wait(&mis->main_thread_load_event); + /* + * Device load in the main thread has finished, we should be in + * POSTCOPY_ACTIVE now. + */ migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, MIGRATION_STATUS_COMPLETED); diff --git a/migration/savevm.c b/migration/savevm.c index 01b5a8bfff..62cc2ce25c 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2170,6 +2170,11 @@ static int loadvm_postcopy_handle_run(MigrationIncomingState *mis, Error **errp) return -1; } + /* We might be already in POSTCOPY_ACTIVE if there is no return path */ + if (mis->state == MIGRATION_STATUS_POSTCOPY_DEVICE) { + migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_DEVICE, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + } postcopy_state_set(POSTCOPY_INCOMING_RUNNING); migration_bh_schedule(loadvm_postcopy_handle_run_bh, mis); diff --git a/migration/savevm.h b/migration/savevm.h index c337e3e3d1..125a2507b7 100644 --- a/migration/savevm.h +++ b/migration/savevm.h @@ -29,6 +29,8 @@ #define QEMU_VM_COMMAND 0x08 #define QEMU_VM_SECTION_FOOTER 0x7e +#define QEMU_VM_PING_PACKAGED_LOADED 0x42 + bool qemu_savevm_state_blocked(Error **errp); void qemu_savevm_non_migratable_list(strList **reasons); int qemu_savevm_state_prepare(Error **errp); diff --git a/migration/trace-events b/migration/trace-events index 772636f3ac..bf11b62b17 100644 --- a/migration/trace-events 
+++ b/migration/trace-events @@ -191,6 +191,7 @@ source_return_path_thread_pong(uint32_t val) "0x%x" source_return_path_thread_shut(uint32_t val) "0x%x" source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32 source_return_path_thread_switchover_acked(void) "" +source_return_path_thread_postcopy_package_loaded(void) "" migration_thread_low_pending(uint64_t pending) "%" PRIu64 migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 " max_size %" PRId64 process_incoming_migration_co_end(int ret) "ret=%d" diff --git a/qapi/migration.json b/qapi/migration.json index c7a6737cc1..93f71de3fe 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -142,6 +142,12 @@ # @postcopy-active: like active, but now in postcopy mode. # (since 2.5) # +# @postcopy-device: like postcopy-active, but the destination is still +# loading device state and is not running yet. If migration fails +# during this state, the source side will resume. If there is no +# return-path from destination to source, this state is skipped. +# (since 10.2) +# # @postcopy-paused: during postcopy but paused. 
(since 3.0) # # @postcopy-recover-setup: setup phase for a postcopy recovery @@ -173,8 +179,8 @@ ## { 'enum': 'MigrationStatus', 'data': [ 'none', 'setup', 'cancelling', 'cancelled', - 'active', 'postcopy-active', 'postcopy-paused', - 'postcopy-recover-setup', + 'active', 'postcopy-device', 'postcopy-active', + 'postcopy-paused', 'postcopy-recover-setup', 'postcopy-recover', 'completed', 'failed', 'colo', 'pre-switchover', 'device', 'wait-unplug' ] } ## diff --git a/tests/qemu-iotests/194 b/tests/qemu-iotests/194 index e114c0b269..806624394d 100755 --- a/tests/qemu-iotests/194 +++ b/tests/qemu-iotests/194 @@ -76,7 +76,7 @@ with iotests.FilePath('source.img') as source_img_path, \ while True: event1 = source_vm.event_wait('MIGRATION') - if event1['data']['status'] == 'postcopy-active': + if event1['data']['status'] in ('postcopy-device', 'postcopy-active'): # This event is racy, it depends do we really do postcopy or bitmap # was migrated during downtime (and no data to migrate in postcopy # phase). So, don't log it. diff --git a/tests/qtest/migration/precopy-tests.c b/tests/qtest/migration/precopy-tests.c index bb38292550..57ca623de5 100644 --- a/tests/qtest/migration/precopy-tests.c +++ b/tests/qtest/migration/precopy-tests.c @@ -1316,13 +1316,14 @@ void migration_test_add_precopy(MigrationTestEnv *env) } /* ensure new status don't go unnoticed */ - assert(MIGRATION_STATUS__MAX == 15); + assert(MIGRATION_STATUS__MAX == 16); for (int i = MIGRATION_STATUS_NONE; i < MIGRATION_STATUS__MAX; i++) { switch (i) { case MIGRATION_STATUS_DEVICE: /* happens too fast */ case MIGRATION_STATUS_WAIT_UNPLUG: /* no support in tests */ case MIGRATION_STATUS_COLO: /* no support in tests */ + case MIGRATION_STATUS_POSTCOPY_DEVICE: /* postcopy can't be cancelled */ case MIGRATION_STATUS_POSTCOPY_ACTIVE: /* postcopy can't be cancelled */ case MIGRATION_STATUS_POSTCOPY_PAUSED: case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: