diff --git a/migration/migration.c b/migration/migration.c
index 5e74993b46..6e647c7c4a 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3526,6 +3526,8 @@ static MigIterateState migration_iteration_run(MigrationState *s)
 
 static void migration_iteration_finish(MigrationState *s)
 {
+    Error *local_err = NULL;
+
     bql_lock();
 
     /*
@@ -3549,11 +3551,28 @@ static void migration_iteration_finish(MigrationState *s)
     case MIGRATION_STATUS_FAILED:
     case MIGRATION_STATUS_CANCELLED:
     case MIGRATION_STATUS_CANCELLING:
-        /*
-         * Re-activate the block drives if they're inactivated. Note, COLO
-         * shouldn't use block_active at all, so it should be no-op there.
-         */
-        migration_block_activate(NULL);
+        if (!migration_block_activate(&local_err)) {
+            /*
+             * Re-activate the block drives if they're inactivated.
+             *
+             * If it fails (e.g. in case of a split brain, where dest
+             * QEMU might have taken some of the drive locks and be
+             * running!), do not start the VM; instead wait for mgmt to
+             * decide the next step.
+             *
+             * If dest already started, it should contain all the data
+             * it needs and properly own all the drive locks.  Then even
+             * if src QEMU sees the migration as FAILED, it should
+             * normally be treated as COMPLETED.
+             *
+             * NOTE: it's no longer safe to start the VM on src even if
+             * dest were to release the drive locks, because once dest
+             * has started running, only dest QEMU's RAM is consistent
+             * with the shared storage.
+             */
+            error_free(local_err);
+            break;
+        }
         if (runstate_is_live(s->vm_old_state)) {
             if (!runstate_check(RUN_STATE_SHUTDOWN)) {
                 vm_start();
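
For context, the caller pattern above follows QEMU's usual Error convention: a
function taking an Error **errp out-parameter returns false on failure and
fills in the error, and the caller either propagates it or frees it with
error_free(). Below is a minimal standalone sketch of that convention, not
QEMU's actual API: the Error struct, error_setg(), and activate_drives() here
are simplified stand-ins for illustration only (QEMU's real error_setg() is a
printf-style macro from error.h).

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for QEMU's Error object: a message wrapped in a struct. */
    typedef struct Error { const char *msg; } Error;

    static void error_setg(Error **errp, const char *msg)
    {
        if (errp) {
            Error *err = malloc(sizeof(*err));
            err->msg = msg;
            *errp = err;
        }
    }

    static void error_free(Error *err)
    {
        free(err);
    }

    /* Hypothetical callee: returns false and sets *errp on failure,
     * mirroring migration_block_activate()'s bool-plus-errp contract. */
    static bool activate_drives(bool locks_held_elsewhere, Error **errp)
    {
        if (locks_held_elsewhere) {
            error_setg(errp, "drive locks are held by another process");
            return false;
        }
        return true;
    }

    int main(void)
    {
        Error *local_err = NULL;

        /* Caller pattern as in migration_iteration_finish(): on failure,
         * report, free the error, and do NOT start the VM. */
        if (!activate_drives(true, &local_err)) {
            fprintf(stderr, "not starting VM: %s\n", local_err->msg);
            error_free(local_err);
            return 1;   /* leave the next step to management */
        }
        return 0;
    }

The design choice in the patch is the same as in this sketch: a failed
activation is swallowed (error_free() plus break) rather than fatal, because
the safe behavior on the source side is to do nothing and let management
decide, not to start a VM whose drives may be owned by the destination.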